def compute_q_values(
    policy: Policy,
    model: ModelV2,
    input_dict,
    state_batches=None,
    seq_lens=None,
    explore=None,
    is_training: bool = False,
):
    """PyTorch variant: compute Q-values from an input dict (plus optional RNN state/seq-lens)."""
    config = policy.config

    # Forward pass through the model to get the shared embedding (and any RNN state-outs).
    model_out, state = model(input_dict, state_batches or [], seq_lens)

    if config["num_atoms"] > 1:
        # Distributional Q-learning: also returns the support z and per-atom logits.
        (
            action_scores,
            z,
            support_logits_per_action,
            logits,
            probs_or_logits,
        ) = model.get_q_value_distributions(model_out)
    else:
        (action_scores, logits, probs_or_logits) = model.get_q_value_distributions(
            model_out
        )

    if config["dueling"]:
        state_score = model.get_state_value(model_out)
        if policy.config["num_atoms"] > 1:
            # Dueling + distributional: center the per-action atom logits over actions,
            # add the state value, then take the expectation over the support z.
            support_logits_per_action_mean = torch.mean(
                support_logits_per_action, dim=1
            )
            support_logits_per_action_centered = (
                support_logits_per_action
                - torch.unsqueeze(support_logits_per_action_mean, dim=1)
            )
            support_logits_per_action = (
                torch.unsqueeze(state_score, dim=1)
                + support_logits_per_action_centered
            )
            support_prob_per_action = nn.functional.softmax(
                support_logits_per_action, dim=-1
            )
            value = torch.sum(z * support_prob_per_action, dim=-1)
            logits = support_logits_per_action
            probs_or_logits = support_prob_per_action
        else:
            # Dueling: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
            advantages_mean = reduce_mean_ignore_inf(action_scores, 1)
            advantages_centered = action_scores - torch.unsqueeze(advantages_mean, 1)
            value = state_score + advantages_centered
    else:
        value = action_scores

    return value, logits, probs_or_logits, state
def compute_q_values(
    policy: Policy,
    model: ModelV2,
    input_batch: SampleBatch,
    state_batches=None,
    seq_lens=None,
    explore=None,
    is_training: bool = False,
):
    """TF variant: compute Q-values from a SampleBatch (plus optional RNN state/seq-lens)."""
    config = policy.config

    # Forward pass through the model to get the shared embedding (and any RNN state-outs).
    model_out, state = model(input_batch, state_batches or [], seq_lens)

    if config["num_atoms"] > 1:
        # Distributional Q-learning: also returns the support z and per-atom logits.
        (
            action_scores,
            z,
            support_logits_per_action,
            logits,
            dist,
        ) = model.get_q_value_distributions(model_out)
    else:
        (action_scores, logits, dist) = model.get_q_value_distributions(model_out)

    if config["dueling"]:
        state_score = model.get_state_value(model_out)
        if config["num_atoms"] > 1:
            # Dueling + distributional: center the per-action atom logits over actions,
            # add the state value, then take the expectation over the support z.
            support_logits_per_action_mean = tf.reduce_mean(
                support_logits_per_action, 1
            )
            support_logits_per_action_centered = (
                support_logits_per_action
                - tf.expand_dims(support_logits_per_action_mean, 1)
            )
            support_logits_per_action = (
                tf.expand_dims(state_score, 1) + support_logits_per_action_centered
            )
            support_prob_per_action = tf.nn.softmax(logits=support_logits_per_action)
            value = tf.reduce_sum(input_tensor=z * support_prob_per_action, axis=-1)
            logits = support_logits_per_action
            dist = support_prob_per_action
        else:
            # Dueling: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
            action_scores_mean = reduce_mean_ignore_inf(action_scores, 1)
            action_scores_centered = action_scores - tf.expand_dims(
                action_scores_mean, 1
            )
            value = state_score + action_scores_centered
    else:
        value = action_scores

    return value, logits, dist, state
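# The non-distributional dueling branch above relies on `reduce_mean_ignore_inf`, an
# RLlib helper. Assuming (from its name and how it is used here) that it averages only
# the finite entries along an axis -- so actions masked out with -inf do not distort
# the advantage mean -- a standalone PyTorch sketch of that aggregation could look like
# the following. `mean_ignore_neg_inf` is a hypothetical stand-in, not RLlib code.
import torch


def mean_ignore_neg_inf(x: torch.Tensor, dim: int) -> torch.Tensor:
    """Mean over `dim`, counting only finite entries (masked actions are -inf)."""
    mask = torch.isfinite(x)
    safe = torch.where(mask, x, torch.zeros_like(x))
    return safe.sum(dim=dim) / mask.sum(dim=dim).clamp(min=1)


# Toy inputs: one sample, three actions, the third action masked out with -inf.
action_scores = torch.tensor([[1.0, 3.0, float("-inf")]])
state_score = torch.tensor([0.5])

advantages_mean = mean_ignore_neg_inf(action_scores, dim=1)  # tensor([2.])
q_values = state_score + (action_scores - advantages_mean.unsqueeze(1))
print(q_values)  # tensor([[-0.5000, 1.5000, -inf]])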
def compute_q_values(
    policy: Policy,
    model: ModelV2,
    obs: TensorType,
    explore,
    is_training: bool = False,
):
    """PyTorch variant (older API): compute Q-values directly from an observation tensor."""
    config = policy.config

    # Forward pass; the observation is wrapped into a minimal input dict.
    model_out, state = model(
        {
            SampleBatch.CUR_OBS: obs,
            "is_training": is_training,
        },
        [],
        None,
    )

    if config["num_atoms"] > 1:
        # Distributional Q-learning: also returns the support z and per-atom logits.
        (
            action_scores,
            z,
            support_logits_per_action,
            logits,
            probs_or_logits,
        ) = model.get_q_value_distributions(model_out)
    else:
        (action_scores, logits, probs_or_logits) = model.get_q_value_distributions(
            model_out
        )

    if config["dueling"]:
        state_score = model.get_state_value(model_out)
        if policy.config["num_atoms"] > 1:
            # Dueling + distributional: center the per-action atom logits over actions,
            # add the state value, then take the expectation over the support z.
            support_logits_per_action_mean = torch.mean(
                support_logits_per_action, dim=1
            )
            support_logits_per_action_centered = (
                support_logits_per_action
                - torch.unsqueeze(support_logits_per_action_mean, dim=1)
            )
            support_logits_per_action = (
                torch.unsqueeze(state_score, dim=1)
                + support_logits_per_action_centered
            )
            support_prob_per_action = nn.functional.softmax(
                support_logits_per_action, dim=-1
            )
            value = torch.sum(z * support_prob_per_action, dim=-1)
            logits = support_logits_per_action
            probs_or_logits = support_prob_per_action
        else:
            # Dueling: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
            advantages_mean = reduce_mean_ignore_inf(action_scores, 1)
            advantages_centered = action_scores - torch.unsqueeze(advantages_mean, 1)
            value = state_score + advantages_centered
    else:
        value = action_scores

    return value, logits, probs_or_logits
def compute_q_values(policy: Policy, model: ModelV2, obs: TensorType, explore):
    """TF variant (older API): compute Q-values directly from an observation tensor."""
    config = policy.config

    # Forward pass; the observation is wrapped into a minimal input dict.
    model_out, state = model(
        {
            SampleBatch.CUR_OBS: obs,
            "is_training": policy._get_is_training_placeholder(),
        },
        [],
        None,
    )

    if config["num_atoms"] > 1:
        # Distributional Q-learning: also returns the support z and per-atom logits.
        (
            action_scores,
            z,
            support_logits_per_action,
            logits,
            dist,
        ) = model.get_q_value_distributions(model_out)
    else:
        (action_scores, logits, dist) = model.get_q_value_distributions(model_out)

    if config["dueling"]:
        state_score = model.get_state_value(model_out)
        if config["num_atoms"] > 1:
            # Dueling + distributional: center the per-action atom logits over actions,
            # add the state value, then take the expectation over the support z.
            support_logits_per_action_mean = tf.reduce_mean(
                support_logits_per_action, 1
            )
            support_logits_per_action_centered = (
                support_logits_per_action
                - tf.expand_dims(support_logits_per_action_mean, 1)
            )
            support_logits_per_action = (
                tf.expand_dims(state_score, 1) + support_logits_per_action_centered
            )
            support_prob_per_action = tf.nn.softmax(logits=support_logits_per_action)
            value = tf.reduce_sum(input_tensor=z * support_prob_per_action, axis=-1)
            logits = support_logits_per_action
            dist = support_prob_per_action
        else:
            # Dueling: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
            action_scores_mean = reduce_mean_ignore_inf(action_scores, 1)
            action_scores_centered = action_scores - tf.expand_dims(
                action_scores_mean, 1
            )
            value = state_score + action_scores_centered
    else:
        value = action_scores

    return value, logits, dist
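# All four variants share the same distributional dueling aggregation: center the
# per-action atom logits over actions, add the state value, softmax over atoms, and
# take the expectation over the support z. A self-contained PyTorch sketch of just
# that step, with made-up toy shapes -- an illustration of the math, not RLlib code.
import torch
import torch.nn.functional as F

batch, num_actions, num_atoms = 2, 4, 51

# Stand-ins for the model outputs (get_q_value_distributions / get_state_value).
support_logits_per_action = torch.randn(batch, num_actions, num_atoms)
state_score = torch.randn(batch, num_atoms)
z = torch.linspace(-10.0, 10.0, num_atoms)  # support of the return distribution

centered = support_logits_per_action - support_logits_per_action.mean(
    dim=1, keepdim=True
)
combined_logits = state_score.unsqueeze(1) + centered
probs = F.softmax(combined_logits, dim=-1)  # per-atom probabilities per action
q_values = torch.sum(z * probs, dim=-1)     # expected Q-values, shape (batch, num_actions)
print(q_values.shape)  # torch.Size([2, 4])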