示例#1
0
def eval_db_agent(env, params):
    """Play a saved ``VAE`` agent in ``env`` and print per-episode statistics.

    The weights are loaded from ``./agents/<arch>_<env_name>`` and
    ``agent.eval()`` is called before the first episode.

    Args:
        env: Environment object. Only ``reset()``, ``render()`` and
            ``step(action)`` are used; ``step`` must return a 4-tuple whose
            first three items are the next state, the reward and the
            terminal flag.
        params: Settings indexed by key. Keys read here: ``use_preproc``,
            ``action_dim``, ``state_dim``, ``use_cuda``, ``arch``,
            ``env_name``, ``num_episodes``, ``max_steps`` and ``env_render``;
            additionally ``history``, ``use_luminance`` and ``resize_shape``
            when ``use_preproc`` is truthy. In that case
            ``params['state_dim']`` is overwritten with the preprocessor's
            ``state_shape``.

    Returns:
        None. Results are only printed.
    """
    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'],
                                    params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    checkpoint_path = './agents/{0}_{1}'.format(params['arch'],
                                                params['env_name'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load(checkpoint_path))
    else:
        # Remap tensors onto the CPU while loading the checkpoint.
        agent.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()
    for episode in xrange(1, params['num_episodes'] + 1):
        env_state = env.reset()
        episode_reward = 0.0
        # Defined up front so the bookkeeping below still works when
        # max_steps is smaller than 1 and the step loop never runs.
        t, terminal = 0, False
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            if preprocessor is not None:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            # The second returned value is not needed during evaluation.
            action, _value = agent.sample_action_eval(var_state)

            # Action-repeat loop; the repeat count is currently fixed at 1.
            reward = 0.0
            for _repeat in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            episode_reward += reward

            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t

        if preprocessor is not None:
            preprocessor.reset()

        # Average over the episodes actually recorded (at most the last 100)
        # instead of always dividing by 100.
        recent_rewards = episode_rewards[-100:]
        mean_reward = sum(recent_rewards) / len(recent_rewards)
        # float() avoids the truncation to 0 that classic integer division
        # would produce for every episode but the last one.
        progress = float(episode) / params['num_episodes']

        print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3} | Total Time {4}'
              .format(episode, agent_steps, episode_reward, mean_reward,
                      timeSince(start, progress)))
示例#2
0
def cache_eval_episode(env, params):
    """Play one episode with a saved agent and pickle its states and distros.

    The agent is built by ``agent_lookup(params)`` and its weights are loaded
    from ``./agents/<arch>_<env_name>``. For every step the (preprocessed)
    state and the third value returned by ``agent.sample_action_distro`` are
    recorded; both lists are converted to numpy arrays and dumped as a tuple
    to ``./out/<arch>_<env_name>_episode.pkl``.

    Args:
        env: Environment object. Only ``reset()``, ``render()`` and
            ``step(action)`` are used; ``step`` must return a 4-tuple whose
            first three items are the next state, the reward and the
            terminal flag.
        params: Settings indexed by key. Keys read here: ``use_preproc``,
            ``state_dim``, ``use_cuda``, ``arch``, ``env_name``,
            ``max_steps``, ``env_render`` and ``print_every``; additionally
            ``history``, ``use_luminance`` and ``resize_shape`` when
            ``use_preproc`` is truthy. In that case ``params['state_dim']``
            is overwritten with the preprocessor's ``state_shape``.

    Returns:
        None. The result is written to disk and a summary line is printed.
    """
    cache_states, cache_distros = [], []

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'],
                                    params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = agent_lookup(params)

    checkpoint_path = './agents/{0}_{1}'.format(params['arch'],
                                                params['env_name'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load(checkpoint_path))
    else:
        # Remap tensors onto the CPU while loading the checkpoint.
        agent.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    # NOTE(review): the sibling evaluation functions call agent.eval() after
    # loading; this one does not. Confirm whether that is intentional.

    agent_steps = 0
    episode_rewards = []
    # Exactly one episode is played; its index is 0.
    for episode in xrange(1):
        env_state = env.reset()
        episode_reward = 0.0
        # Defined up front so the bookkeeping below still works when
        # max_steps is smaller than 1 and the step loop never runs.
        t, terminal = 0, False
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            if preprocessor is not None:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, _value, distro = agent.sample_action_distro(var_state)
            cache_states.append(state)
            cache_distros.append(distro.cpu().numpy())

            # Action-repeat loop; the repeat count is currently fixed at 1.
            reward = 0.0
            for _repeat in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            episode_reward += reward

            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t

        if preprocessor is not None:
            preprocessor.reset()

        if episode % params['print_every'] == 0:
            # Average over the episodes actually recorded instead of always
            # dividing by 100.
            recent_rewards = episode_rewards[-100:]
            mean_reward = sum(recent_rewards) / len(recent_rewards)
            print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}'
                  .format(episode, agent_steps, episode_reward, mean_reward))

    cache_states = np.array(cache_states)
    cache_distros = np.array(cache_distros)
    out_path = './out/{0}_{1}_episode.pkl'.format(params['arch'],
                                                  params['env_name'])
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails. Protocol -1 selects the highest available protocol.
    with open(out_path, 'wb') as out_file:
        pickle.dump((cache_states, cache_distros), out_file, -1)
示例#3
0
def cache_abstraction(env, params):
    """Play one episode with a saved ``VAE`` agent and save frames by code.

    For every step, the image returned by ``preprocessor.get_img_state()`` is
    saved as ``./out/<env_name>/<code>/<t>.png``, where ``code`` is the third
    value returned by ``agent.sample_action_eval_code`` and ``t`` is the
    1-based step index. Any existing ``./out/<env_name>`` directory is
    deleted first.

    Args:
        env: Environment object. Only ``reset()``, ``render()`` and
            ``step(action)`` are used; ``step`` must return a 4-tuple whose
            first three items are the next state, the reward and the
            terminal flag.
        params: Settings indexed by key. Keys read here: ``use_preproc``,
            ``state_dim``, ``history``, ``use_luminance``, ``resize_shape``,
            ``action_dim``, ``use_cuda``, ``arch``, ``env_name``,
            ``max_steps`` and ``env_render``. ``params['state_dim']`` is
            overwritten with the preprocessor's ``state_shape``.

    Returns:
        None. Images are written to disk and a summary line is printed.

    Raises:
        ValueError: If ``params['use_preproc']`` is falsy. The frames come
            from the preprocessor, so without one there is nothing to save.
    """
    # Fail before touching the output directory: the images are taken from
    # the preprocessor, so running without one could only crash mid-episode
    # after the previous results had already been removed.
    if not params['use_preproc']:
        raise ValueError(
            "cache_abstraction requires params['use_preproc'] to be enabled")

    env_out_dir = './out/{0}'.format(params['env_name'])
    if os.path.exists(env_out_dir):
        shutil.rmtree(env_out_dir)

    preprocessor = Preprocessor(params['state_dim'], params['history'],
                                params['use_luminance'],
                                params['resize_shape'])
    params['state_dim'] = preprocessor.state_shape

    agent = VAE(params['state_dim'], params['action_dim'])
    checkpoint_path = './agents/{0}_{1}'.format(params['arch'],
                                                params['env_name'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load(checkpoint_path))
    else:
        # Remap tensors onto the CPU while loading the checkpoint.
        agent.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    # Exactly one episode is played; its index is 0.
    for episode in xrange(1):
        env_state = env.reset()
        episode_reward = 0.0
        # Defined up front so the bookkeeping below still works when
        # max_steps is smaller than 1 and the step loop never runs.
        t, terminal = 0, False
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            state = preprocessor.process_state(env_state)

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, _value, code = agent.sample_action_eval_code(var_state)

            # One sub-directory per code; frames are named by step index.
            code_dir = './out/{0}/{1}'.format(params['env_name'], code)
            if not os.path.exists(code_dir):
                os.makedirs(code_dir)
            preprocessor.get_img_state().save(
                './out/{0}/{1}/{2}.png'.format(params['env_name'], code, t))

            # Action-repeat loop; the repeat count is currently fixed at 1.
            reward = 0.0
            for _repeat in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            episode_reward += reward

            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t

        preprocessor.reset()

        # Average over the episodes actually recorded instead of always
        # dividing by 100.
        recent_rewards = episode_rewards[-100:]
        mean_reward = sum(recent_rewards) / len(recent_rewards)
        print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}'
              .format(episode, agent_steps, episode_reward, mean_reward))
示例#4
0
def train_agent(env, params):
    """Train the agent returned by ``agent_lookup(params)`` on ``env``.

    Each episode runs for at most ``params['max_steps']`` steps. Rewards are
    appended to ``agent.rewards``; ``train_step`` is invoked every
    ``params['update_freq']`` steps and once more after the episode ends.
    Statistics are printed every ``params['print_every']`` episodes.

    Args:
        env: Environment object. Only ``reset()``, ``render()`` and
            ``step(action)`` are used; ``step`` must return a 4-tuple whose
            first three items are the next state, the reward and the
            terminal flag.
        params: Settings indexed by key. Keys read here: ``use_preproc``,
            ``state_dim``, ``optim`` (``'rms'`` or ``'adam'``),
            ``learning_rate``, ``use_cuda``, ``num_episodes``,
            ``max_steps``, ``env_render``, ``update_freq``, ``arch`` and
            ``print_every``; additionally ``history``, ``use_luminance`` and
            ``resize_shape`` when ``use_preproc`` is truthy. In that case
            ``params['state_dim']`` is overwritten with the preprocessor's
            ``state_shape``.

    Returns:
        None. Progress is only printed.

    Raises:
        SystemExit: If ``params['optim']`` names an unknown optimizer.
    """
    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'],
                                    params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = agent_lookup(params)
    agent.train()

    if params['optim'] == 'rms':
        optimizer_cls = torch.optim.RMSprop
    elif params['optim'] == 'adam':
        optimizer_cls = torch.optim.Adam
    else:
        print('Unknown optimizer specified!')
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    # Move the model to the GPU *before* building the optimizer, as the
    # torch.optim documentation requires, so the optimizer is constructed
    # over the parameters in their final location.
    if params['use_cuda']:
        agent = agent.cuda()

    optimizer = optimizer_cls(agent.parameters(), lr=params['learning_rate'])

    agent_steps = 0
    episode_rewards = []
    start = time.time()
    for episode in xrange(1, params['num_episodes'] + 1):
        env_state = env.reset()
        episode_reward = 0.0
        policy_loss, value_loss = 0.0, 0.0
        num_updates = 0
        # Defined up front so the bookkeeping below still works when
        # max_steps is smaller than 1 and the step loop never runs.
        t, terminal = 0, False
        for t in xrange(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            if preprocessor is not None:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val = agent.sample_action(var_state)

            # Action-repeat loop; the repeat count is currently fixed at 1.
            reward = 0.0
            for _repeat in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            agent.rewards.append(reward)
            episode_reward += reward

            if terminal:
                agent.final_state_val = 0.0
                break

            if t % params['update_freq'] == 0:
                # NOTE(review): state_val was produced for the state observed
                # before this step, not for the state env.step() returned;
                # confirm this is the intended bootstrap value.
                agent.final_state_val = state_val[0]
                pl, vl = train_step(agent, optimizer, params)
                policy_loss += pl
                value_loss += vl
                num_updates += 1

        episode_rewards.append(episode_reward)
        # Final update for whatever was collected since the last update.
        agent.final_state_val = 0.0
        pl, vl = train_step(agent, optimizer, params)
        policy_loss += pl
        value_loss += vl
        num_updates += 1
        agent_steps += t

        if preprocessor is not None:
            preprocessor.reset()

        if params['arch'] in ['VQ-A2C']:
            # Report the entries visited this episode, then start a fresh set.
            visit = len(agent.visited), agent.visited
            agent.visited = set()
        else:
            visit = 0

        if episode % params['print_every'] == 0:
            # Average over the episodes actually recorded (at most the last
            # 100) instead of always dividing by 100.
            recent_rewards = episode_rewards[-100:]
            mean_reward = sum(recent_rewards) / len(recent_rewards)
            # float() avoids the truncation to 0 that classic integer
            # division would produce for every episode but the last one.
            progress = float(episode) / params['num_episodes']
            print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3} | Policy Loss {4} | Value Loss {6} | Total Time {5} | S_A {7}'
                  .format(episode, agent_steps, episode_reward, mean_reward,
                          policy_loss / num_updates,
                          timeSince(start, progress),
                          value_loss / num_updates, visit))