def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.train()

    state = env.reset()
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    avg_reward_for_num_steps_list = []
    total_length = 0
    episode_length = 0
    n_episode = 0
    total_reward_for_episode = 0
    all_rewards_in_episode = []

    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            log_probs.append(log_prob)

            action_int = action.cpu().numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int, verbose=False)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                state = env.reset()
                print('Process {} Episode {} Over with Length: {} and Reward: {: .2f}. '
                      'Total Trained Length: {}'.format(
                          rank, n_episode, episode_length,
                          total_reward_for_episode, total_length))
                sys.stdout.flush()
                episode_length = 0
                n_episode += 1

            values.append(value)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)
            if done:
                break

        if args.synchronous:
            if total_reward_for_episode >= args.solved_reward:
                print("Process {} Solved with Reward {}".format(
                    rank, total_reward_for_episode))
                env.close()
                break

        # Monitoring
        total_reward_for_num_steps = sum(rewards)
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if args.cuda:
            if args.point_cloud_model:
                state = (state[0].cuda(), state[1].cuda())
            else:
                state = state.cuda()
            R = R.cuda()
            gae = gae.cuda()
        if not done:  # bootstrap: use the critic's value of the last state as R
            value, _, _ = model((state, (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # good place to breakpoint to see training cycle
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
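# The train loops in this file call ensure_shared_grads() to hand the local
# worker's gradients to the shared model before optimizer.step(). The helper is
# not shown in this excerpt; below is a minimal sketch following the common
# pytorch-a3c pattern (an assumption, not necessarily this repo's exact code).
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        # if gradients were already attached this step, keep them
        if shared_param.grad is not None:
            return
        # point the shared parameter's grad at the local worker's gradient
        # (if the local model lives on GPU, a .cpu() copy may be needed here)
        shared_param._grad = param.grad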
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
if args.cuda:
    print('Using', torch.cuda.get_device_name(0))
    torch.cuda.init()
torch.manual_seed(args.seed)

args.config_dict = {
    'max_episode_length': args.max_episode_length,
    'point_cloud_model': args.point_cloud_model
}
# This env instance exists only to read the params the model needs
# (action/observation space sizes, frame resolution); it is closed below.
env = AI2ThorEnv(config_dict=args.config_dict)
if args.point_cloud_model:
    shared_model = ActorCritic(env.action_space.n)
else:
    args.frame_dim = env.config['resolution'][-1]
    shared_model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                               args.frame_dim)
if args.cuda:
    shared_model = shared_model.cuda()
shared_model.share_memory()
env.close()

optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
optimizer.share_memory()
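# my_optim.SharedAdam above is an Adam variant whose optimizer state lives in
# shared memory, so every worker updates the same first/second moment
# estimates. Its definition is not in this excerpt; this is a hedged sketch of
# the widely used pytorch-a3c version, not necessarily this repo's exact code.
import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Adam whose state tensors are allocated eagerly so they can be shared."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        # allocate state up front so share_memory() can move it before forking
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # standard Adam update, written out so it uses the shared state tensors
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss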
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True  # must be True initially so the model syncs on the first step
    start_time = time.time()

    # a ring buffer of recent actions, used below to detect a stuck agent
    actions = deque(maxlen=100)
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    win = vis.line(X=[0.], Y=[0.], win='testing_Rewards',
                   opts=dict(title='testing_Rewards'))

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float(), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck:
        # in test mode the greedy agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'
                  .format(actions.maxlen))
            done = True

        if done:
            print("In test. Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            vis.line(X=[episodes], Y=[reward_sum], win='testing_Rewards',
                     update='append')
            episodes += 1
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            # sleep so the test worker does not hog resources between episodes
            time.sleep(args.test_sleep_time)
            print('testing...')

        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    save = '{}-steps{}-process{}-lr{}-entropy_coef{}'.format(
        "point" if args.point_cloud_model else "conv", args.num_steps,
        args.num_processes, args.lr, args.entropy_coef)
    save = os.path.join('logs', save)
    os.makedirs(save, exist_ok=True)
    if args.model:
        shared_model.load_state_dict(
            torch.load(os.path.join(save, "solved_ai2thor.pth")))
    else:
        logger = CSVLogger(os.path.join(save, 'test.csv'))
        fields = ['episode_reward', 'frames_rendered']
        logger.log(fields)

    start_time = time.time()
    # NB: the repeat-action "stuck agent" hack used in the other test variants
    # is disabled in this version.
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(deepcopy(shared_model.state_dict()))
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            # greedy action: in test mode we take the argmax instead of sampling
            action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {: .2f}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            if not args.model:
                logger.log(["{: .2f}".format(reward_sum), counter.value])
            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           os.path.join(save, "solved_ai2thor.pth"))
                env.close()
                if not args.model:  # logger only exists when not replaying a model
                    logger.close()
                break
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(args.test_sleep_time)
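# CSVLogger above is a small logging helper not included in this excerpt. A
# minimal sketch that matches how it is used here (log() writes one list as one
# row, close() closes the file) could look like this -- an assumption about the
# interface, not the repo's actual implementation:
import csv


class CSVLogger:
    def __init__(self, path):
        # overwrite any previous log at this path
        self.file = open(path, 'w', newline='')
        self.writer = csv.writer(self.file)

    def log(self, row):
        self.writer.writerow(row)
        self.file.flush()  # keep the CSV readable while training is running

    def close(self):
        self.file.close()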
def train(rank, args, shared_model, counter, lock, device, optimizer=None):
    torch.manual_seed(args.seed + rank)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    model = model.to(device)
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    all_rewards_in_episode = []
    avg_reward_for_num_steps_list = []
    total_length = 0
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    vis.line(X=[0.], Y=[0.], win='training_Rewards' + str(rank),
             opts=dict(title='training_Rewards' + str(rank)))

    while True:
        episodes += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            # run the forward pass on `device`, then bring the results back to
            # CPU so the rollout buffers stay device-agnostic
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float().to(device),
                 (hx.to(device), cx.to(device))))
            value = value.cpu()
            logit = logit.cpu()
            hx = hx.cpu()
            cx = cx.cpu()
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                vis.line(X=[episodes], Y=[total_reward_for_episode],
                         win='training_Rewards' + str(rank), update='append')
                print('In Train. Episode Over. Total Length: {}. '
                      'Total reward for episode: {}'.format(
                          total_length, total_reward_for_episode))
                print('In Train. Step no: {}. total length: {}'.format(
                    episode_length, total_length))
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)
            if done:
                break

        # No interaction with environment below.

        # Monitoring
        total_reward_for_num_steps = sum(rewards)  # accumulated over the rollout
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # bootstrap: use the critic's value of the last state as R
            value, _, _ = model(
                (state.unsqueeze(0).float().to(device),
                 (hx.to(device), cx.to(device))))
            R = value.detach().cpu()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # good place to breakpoint to see training cycle
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()

    # a ring buffer of recent actions, used below to detect a stuck agent
    actions = deque(maxlen=100)
    episode_length = 0

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 64)
            hx = torch.zeros(1, 64)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float(), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            # greedy action: in test mode we take the argmax instead of sampling
            action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck:
        # in test mode the greedy agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'
                  .format(actions.maxlen))
            done = True

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           "solved_{}.pth".format("atari" if args.atari else "ai2thor"))
                env.close()
                break
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)

        state = torch.from_numpy(state)
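# None of the snippets above show how the workers are launched. A typical
# wiring, following the standard pytorch-a3c layout (a sketch under that
# assumption, matching the train/test signatures of the first pair of
# functions above): the main process builds shared_model and the shared
# optimizer, then forks one test process and args.num_processes train
# processes that all share the model, a global step counter, and a lock.
import torch.multiprocessing as mp

if __name__ == '__main__':
    counter = mp.Value('i', 0)  # global step counter, incremented by trainers
    lock = mp.Lock()            # guards increments of the counter

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args,
                                      shared_model, counter))
    p.start()
    processes.append(p)
    for rank in range(args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model,
                                           counter, lock, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()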