Example #1
    def __init__(self,
                 obs_space,
                 share_obs_space,
                 action_space,
                 gain=1,
                 base=None,
                 base_kwargs=None,
                 device=torch.device("cpu")):
        super(Policy, self).__init__()
        self.mixed_action = False
        self.multi_discrete = False
        self.device = device
        if base_kwargs is None:
            base_kwargs = {}

        if obs_space.__class__.__name__ == "Box":
            obs_shape = obs_space.shape
            share_obs_shape = share_obs_space.shape
        elif obs_space.__class__.__name__ == "list":
            obs_shape = obs_space
            share_obs_shape = share_obs_space
        else:
            raise NotImplementedError

        if len(obs_shape) == 3:
            self.base = CNNBase(obs_shape, share_obs_shape, **base_kwargs)
        else:
            self.base = MLPBase(obs_shape, share_obs_shape, **base_kwargs)

        if action_space.__class__.__name__ == "Discrete":
            num_actions = action_space.n
            self.dist = Categorical(self.base.output_size, num_actions, gain)
        elif action_space.__class__.__name__ == "Box":
            num_actions = action_space.shape[0]
            self.dist = DiagGaussian(self.base.output_size, num_actions)
        elif action_space.__class__.__name__ == "MultiBinary":
            num_actions = action_space.shape[0]
            self.dist = Bernoulli(self.base.output_size, num_actions)
        elif action_space.__class__.__name__ == "MultiDiscrete":
            self.multi_discrete = True
            self.discrete_N = action_space.shape
            action_size = action_space.high - action_space.low + 1
            self.dists = []
            for num_actions in action_size:
                self.dists.append(
                    Categorical(self.base.output_size, num_actions, gain))
            self.dists = nn.ModuleList(self.dists)
        else:  # discrete + continuous
            self.mixed_action = True
            continous = action_space[0].shape[0]
            discrete = action_space[1].n
            self.dist = nn.ModuleList([
                DiagGaussian(self.base.output_size, continous),
                Categorical(self.base.output_size, discrete, gain)
            ])
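A minimal usage sketch for the constructor above, assuming gym-style spaces and that Policy (with its torch dependencies) is importable from the surrounding module; the space shapes below are illustrative only.

import torch
from gym import spaces

# Policy as defined in Example #1 (hypothetical import path).
# A 1-D Box observation selects the MLPBase branch,
# and a Discrete action space selects the Categorical head.
obs_space = spaces.Box(low=-1.0, high=1.0, shape=(18,))
share_obs_space = spaces.Box(low=-1.0, high=1.0, shape=(54,))
action_space = spaces.Discrete(5)

policy = Policy(obs_space, share_obs_space, action_space,
                gain=1, base_kwargs={}, device=torch.device("cpu"))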
Example #2
 def split_obs(obs, x, num_parents):
     node_var: Categorical = obs[x]
     obs_values = []
     # zip() wraps each value in a 1-tuple, which becomes the single-outcome support of a new Categorical
     for v in zip(node_var.vals):
         obs_copy = [o for o in obs]
         obs_copy[x] = Categorical(v)
         # Copy obs to new nodes
         for i in range(num_parents - 1):
             obs_copy.append(Categorical(v))
         obs_values.append(obs_copy)
     probs = node_var.probs
     return probs, obs_values
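A small illustrative call for split_obs, assuming the Categorical class used throughout these examples (list-style vals with optional probs); the concrete values are made up.

# Node 1 has two possible values; splitting on it with two parents yields two
# candidate observation lists, each with node 1 fixed and one duplicate appended.
obs = [Categorical([0]), Categorical([-10, 10], probs=[0.5, 0.5]), Categorical([5])]
probs, obs_values = split_obs(obs, x=1, num_parents=2)
# probs echoes node 1's probabilities; len(obs_values) == 2 and len(obs_values[0]) == 4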
Example #3
 def prepare_loss(self):
     with tf.device(self.device):
         print("    [{}]Preparing loss".format(self.id))
         # [Policy distribution]
         if self.is_continuous_control():
             # Old policy
             old_policy_batch = tf.transpose(self.old_policy_batch,
                                             [1, 0, 2])
             old_policy_distributions = Normal(old_policy_batch[0],
                                               old_policy_batch[1])
             # New policy
             new_policy_batch = tf.transpose(self.policy_batch, [1, 0, 2])
             new_policy_distributions = Normal(new_policy_batch[0],
                                               new_policy_batch[1])
         else:  # discrete control
             old_policy_distributions = Categorical(
                 self.old_policy_batch)  # Old policy
             new_policy_distributions = Categorical(
                 self.policy_batch)  # New policy
         # [Actor loss]
         policy_loss_builder = PolicyLoss(
             cliprange=self.clip,
             cross_entropy=new_policy_distributions.cross_entropy(
                 self.old_action_batch),
             old_cross_entropy=old_policy_distributions.cross_entropy(
                 self.old_action_batch),
             advantage=self.advantage_batch,
             # entropy=self.fentropy,
             entropy=new_policy_distributions.entropy(),
             beta=self.beta)
         self.policy_loss = policy_loss_builder.get()
         # [Critic loss]
         value_loss_builder = ValueLoss(cliprange=self.clip,
                                        value=self.value_batch,
                                        old_value=self.old_value_batch,
                                        reward=self.cumulative_reward_batch)
         # usually critic has lower learning rate
         self.value_loss = flags.value_coefficient * value_loss_builder.get()
         # [Extra loss]
         self.extra_loss = tf.constant(0.)
         if self.predict_reward:
             self.extra_loss += self._reward_prediction_loss()
         # [Debug variables]
         self.policy_kl_divergence = policy_loss_builder.approximate_kullback_leibler_divergence()
         self.policy_clipping_frequency = policy_loss_builder.get_clipping_frequency()
         self.policy_entropy_contribution = policy_loss_builder.get_entropy_contribution()
         self.total_loss = self.policy_loss + self.value_loss + self.extra_loss
Example #4
    def get_goal_distribution(self, goal, n=4):
        """
        Returns the reward distribution for a goal state taking the best path to the goal into account.
        """

        #max_path_reward = self.get_max_path_reward(goal)
        max_path = self.get_max_path(goal, include=True)
        path_reward = [
            self._state[node] if hasattr(self._state[node], "sample") else
            Categorical([self._state[node]]) for node in max_path[1:]
        ]
        reward = Categorical([0])
        for r in path_reward:
            reward += r
        return shrink_categorical(reward, n=n)
Example #5
	def prepare_loss(self, global_step):
		self.global_step = global_step
		print( "Preparing loss {}".format(self.id) )
		self.state_value_batch = self.critic_batch
		# [Policy distribution]
		old_policy_distributions = []
		new_policy_distributions = []
		policy_loss_builder = []
		for h,policy_head in enumerate(self.policy_heads):
			if is_continuous_control(policy_head['depth']):
				# Old policy
				old_policy_batch = tf.transpose(self.old_policy_batch[h], [1, 0, 2])
				old_policy_distributions.append( Normal(old_policy_batch[0], old_policy_batch[1]) )
				# New policy
				new_policy_batch = tf.transpose(self.actor_batch[h], [1, 0, 2])
				new_policy_distributions.append( Normal(new_policy_batch[0], new_policy_batch[1]) )
			else: # discrete control
				old_policy_distributions.append( Categorical(self.old_policy_batch[h]) ) # Old policy
				new_policy_distributions.append( Categorical(self.actor_batch[h]) ) # New policy
			builder = self._get_policy_loss_builder(new_policy_distributions[h], old_policy_distributions[h], self.old_action_batch[h], self.old_action_mask_batch[h] if self.has_masked_actions else None)
			policy_loss_builder.append(builder)
		# [Actor loss]
		self.policy_loss = sum(self._get_policy_loss(b) for b in policy_loss_builder)
		# [Debug variables]
		self.policy_kl_divergence = sum(b.approximate_kullback_leibler_divergence() for b in policy_loss_builder)
		self.policy_clipping_frequency = sum(b.get_clipping_frequency() for b in policy_loss_builder)/len(policy_loss_builder) # take average because clipping frequency must be in [0,1]
		self.policy_entropy_regularization = sum(b.get_entropy_regularization() for b in policy_loss_builder)
		# [Critic loss]
		value_loss_builder = self._get_value_loss_builder()
		self.value_loss = self._get_value_loss(value_loss_builder)
		# [Entropy regularization]
		if flags.entropy_regularization:
			self.policy_loss += -self.policy_entropy_regularization
		# [Constraining Replay]
		if self.constrain_replay:
			constrain_loss = sum(
				0.5*builder.reduce_function(tf.squared_difference(new_distribution.mean(), tf.stop_gradient(old_action))) 
				for builder, new_distribution, old_action in zip(policy_loss_builder, new_policy_distributions, self.old_action_batch)
			)
			self.policy_loss += tf.cond(
				pred=self.is_replayed_batch[0], 
				true_fn=lambda: constrain_loss,
				false_fn=lambda: tf.constant(0., dtype=self.parameters_type)
			)
		# [Total loss]
		self.total_loss = self.policy_loss + self.value_loss
		if flags.intrinsic_reward:
			self.total_loss += self.intrinsic_reward_loss
Example #6
    def __init__(self,
                 obs_shape,
                 action_space,
                 model_type=0,
                 base_kwargs=None):

        super(RL_Policy, self).__init__()
        if base_kwargs is None:
            base_kwargs = {}

        if model_type == 0:
            self.network = Global_Policy(obs_shape, **base_kwargs)
        else:
            raise NotImplementedError

        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(self.network.output_size, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(self.network.output_size, num_outputs)
        else:
            raise NotImplementedError

        self.model_type = model_type
Example #7
    def high_vpi(self, state, bins=4):
        """Returns the high level VPI

        Arguments:
            state: high state for computation
            bins: number of bins to discretize continuous distribution
        """
        dists = []
        # To get the node distributions
        for option in range(1, self.no_options + 1):
            goal_clicked = self.goals[option - 1][0]
            node = self.low_state[goal_clicked]
            if hasattr(node, 'sample'):
                if hasattr(node, 'mu'):
                    dist = node.to_discrete(n=bins, max_sigma=4)
                    dist.vals = tuple([(round(val, 3)) for val in dist.vals])
                    dist.probs = tuple([(round(p, 3)) for p in dist.probs])
                else:
                    dist = node
            else:
                dist = Categorical(vals=[node], probs=[1])
            dists.append(dist)
        net_dist = self.shrink(dists)
        expected_return = cmax(net_dist).expectation()
        return expected_return - self.expected_high_term_reward(state)
Example #8
    def __init__(self,
                 obs_shape,
                 action_space,
                 num_agents,
                 base=None,
                 base_kwargs=None):
        super(Policy, self).__init__()
        if base_kwargs is None:
            base_kwargs = {}
        if base is None:
            if len(obs_shape) == 3:
                base = CNNBase
                self.base = base(num_agents, obs_shape, **base_kwargs)
            elif len(obs_shape) == 1:
                base = MLPBase
                self.base = base(num_agents, obs_shape[0], **base_kwargs)
            else:
                raise NotImplementedError

        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(self.base.output_size, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(self.base.output_size, num_outputs)
        elif action_space.__class__.__name__ == "MultiBinary":
            num_outputs = action_space.shape[0]
            self.dist = Bernoulli(self.base.output_size, num_outputs)
        else:
            raise NotImplementedError
Example #9
    def vpi_action(self, action, state) -> 'float, >= -0.001':
        """
        Calculates vpi action. Nodes of importance are those that are either parents or children of the selected node.
        """

        # print("Ground Truth = {}".format(self.ground_truth))
        # print("State = {}".format(state))
        # print("Action = {}".format(action))
        option_dist = []
        obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
        obs = list(set(obs))
        for option in range(1, self.no_options + 1):
            op_dist = self.node_value_after_observe_option(option, state, obs)
            node_idx = self.goals[option - 1][0]
            if not hasattr(state[node_idx], 'sample'):
                goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
            else:
                goal_dist = state[node_idx]
            dists = [op_dist, goal_dist]
            option_dist.append(cross_1(dists, sum))

        net_dist = self.shrink(option_dist)
        nvao = float(cmax(net_dist, default=ZERO).expectation())

        # print(obs)
        # print("Env.state = {}".format(state))
        # for _,i in enumerate(state):
        #     print(i)
        # print("Expected Term Reward = {}".format(self.expected_term_reward(state)))
        # print("Observe Node Expected = {}".format(self.node_value_after_observe(obs, 0, state,verbose).expectation()))
        result = nvao - self.expected_term_reward_disc(state)
        if abs(result) < 0.001:
            result = 0.0

        return result
Example #10
    def low_vpi(self, option, state) -> 'float, >= -0.001':
        """
        Calculates vpi for a given option. All nodes of the branch in the option set are important.
        This is essentially vpi_action with the goal node selected.

        Arguments:
            option: option for computation
            state: state for computation
        """
        action = self.goals[option - 1][0]
        obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
        obs = list(set(obs))
        op_dist = self.node_value_after_observe_option(option, 0, state, obs)
        node_idx = self.goals[option - 1][0]
        if not hasattr(state[node_idx], 'sample'):
            goal_dist = Categorical(vals=[state[node_idx]], probs=[1])
        else:
            goal_dist = state[node_idx]
        dists = [op_dist, goal_dist]
        nvao = float((cross_1(dists, sum)).expectation())
        result = nvao - self.expected_low_term_reward_disc(option, state)

        if abs(result) < 0.001:
            result = 0.0
        return result
Example #11
    def vpi(self, state) -> 'float, >= -0.001':
        """
        Calculates vpi. All nodes of the branch are important; this is essentially vpi_action with the goal node selected.
        """
        option_dist = []
        for option in range(1, self.no_options+1):
            action = self.goals[option - 1][0]
            obs = (*self.subtree[action][0:], *self.path_to(action)[1:])
            obs = list(set(obs))
            op_dist = self.node_value_after_observe_option(option, state, obs)
            node_idx = self.goals[option-1][0]
            if not hasattr(state[node_idx], 'sample'):
                goal_dist = Categorical(vals=[state[node_idx]], probs= [1])
            else:
                goal_dist = state[node_idx]
            dists = [op_dist, goal_dist]

            option_dist.append(cross_1(dists, sum))

        net_dist = self.shrink(option_dist)
        nvao = float(cmax(net_dist, default=ZERO).expectation())
        # print("VPI Node observe value = {}".format(nvao))
        result = nvao - self.expected_term_reward_disc(state)
        if abs(result) < 0.001:
            result = 0.0
        return result
Example #12
    def get_goal_myopic_distribution(self, goal, n=4):
        """Finds the best click of the goal subtree through variance heuristic, then calculates the goal distribution based on the distribution of the best click and the expected reward of other nodes on the best path.

        Args:
            goal (int): Node index of the goal node
        """
        max_path = self.get_max_path(goal, include=True)
        max_variance_node = max(max_path, key=self.variance)
        reward = Categorical([0])
        for node in max_path:
            value = self._state[node]
            if node == max_variance_node:
                reward += value
            elif hasattr(value, "sample"):
                reward += Categorical([value.expectation()])
            else:
                reward += Categorical([value])
        return shrink_categorical(reward, n=n)
Example #13
	def sample_actions(self):
		action_batch = []
		hot_action_batch = []
		for h,actor_head in enumerate(self.actor_batch):
			if is_continuous_control(self.policy_heads[h]['depth']):
				new_policy_batch = tf.transpose(actor_head, [1, 0, 2])
				sample_batch = Normal(new_policy_batch[0], new_policy_batch[1]).sample()
				action = tf.clip_by_value(sample_batch, -1,1)
				action_batch.append(action) # Sample action batch in forward direction, use old action in backward direction
				hot_action_batch.append(action)
			else: # discrete control
				distribution = Categorical(actor_head)
				action = distribution.sample(one_hot=False) # Sample action batch in forward direction, use old action in backward direction
				action_batch.append(action)
				hot_action_batch.append(distribution.get_sample_one_hot(action))
		# Give the output a self-explanatory name for easy retrieval from the frozen graph
		# tf.identity(action_batch, name="action")
		return action_batch, hot_action_batch
Example #14
    def __init__(self, obs_space, action_space, base=None, base_kwargs=None):
        super(Policy, self).__init__()

        pixel_shape, non_pixel_obs, non_pixel_shape = parse_obs_space(
            obs_space)
        action_spaces, action_spaces_name = parse_action_space(action_space)
        # pixel:tuple (h,w,c),
        # non_pixel_obs = ['name']
        # non_pixel_shape :int = len(non_pixel_obs)
        # action_spaces: [2,2,1,1...]
        # action_spaces_name = ['attack',...]

        if base_kwargs is None:
            base_kwargs = {}
        base = Branch_CNNBase

        if non_pixel_shape == 0:
            add_non_pixel = False
        else:
            add_non_pixel = True

        # arguments
        non_pixel_layer = base_kwargs['non_pixel_layer']
        convs = base_kwargs['convs']
        in_channels = base_kwargs['frame_history_len'] * pixel_shape[2]
        in_feature = base_kwargs['in_feature']
        hidden_actions = base_kwargs['hidden_actions']
        hidden_value = base_kwargs['hidden_value']
        aggregator = base_kwargs['aggregator']

        self.num_branches = len(action_spaces)
        self.base = base(add_non_pixel, non_pixel_shape, non_pixel_layer,
                         convs, in_channels, in_feature, hidden_actions,
                         hidden_value, action_spaces, aggregator)

        self.dist_idxes = []
        dist_l = 1
        for i in range(self.num_branches):
            if (action_spaces[i] == 1):
                # continuous action space
                num_outputs = action_spaces[i]
                num_inputs = hidden_actions[-1]
                setattr(self, "dist" + str(dist_l),
                        DiagGaussian(num_inputs, num_outputs))
                self.dist_idxes.append(dist_l)
                dist_l += 1
            else:
                # discrete action space
                num_inputs = hidden_actions[-1]
                num_outputs = action_spaces[i]
                setattr(self, "dist" + str(dist_l),
                        Categorical(num_inputs, num_outputs))
                self.dist_idxes.append(dist_l)
                dist_l += 1
Example #15
 def to_obs_tree(self, state, node, obs=()):
     """Updated obs tree computation for tree contraction method.
     """
     state = [
         state[n] if n in obs else expectation(state[n])
         for n in range(len(state))
     ]
     state = [
         node if hasattr(node, "sample") else Categorical([node])
         for node in state
     ]
     return tuple(state)
Example #16
def shrink_categorical(cat, n=4):
    '''
    Reduces the categorical distribution to a distribution of size n using k-means clustering.
    :param cat: categorical distribution
    :param n: number of bins/clusters to be reduced to
    :return:
    '''
    if (not hasattr(cat, "sample")) or (len(cat.vals) < n):
        return cat
    clusters, centroids = kmeans1d.cluster(cat.vals, n)
    probs = [0 for _ in range(n)]
    for cluster, prob in zip(clusters, cat.probs):
        probs[cluster] += prob
    return Categorical(centroids, probs=probs)
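A quick sketch of the reduction above, assuming Categorical exposes vals/probs as in the other examples; the input distribution is made up.

wide = Categorical([0, 1, 2, 10, 11, 12, 50, 51],
                   probs=[0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1])
narrow = shrink_categorical(wide, n=3)
# kmeans1d groups nearby support points into 3 clusters; each cluster keeps its
# centroid as the value and the summed probability mass, so narrow has 3 outcomes.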
Example #17
 def sample_actions(self):
     with tf.device(self.device):
         if self.is_continuous_control():
             new_policy_batch = tf.transpose(self.policy_batch, [1, 0, 2])
             sample_batch = Normal(new_policy_batch[0],
                                   new_policy_batch[1]).sample()
             # Sample action batch in forward direction, use old action in backward direction
             action_batch = tf.clip_by_value(sample_batch, -1, 1)
         else:  # discrete control
             # Sample action batch in forward direction, use old action in backward direction
             action_batch = Categorical(self.policy_batch).sample()
         # Give the output a self-explanatory name for easy retrieval from the frozen graph
         tf.identity(action_batch, name="action")
         return action_batch
Example #18
    def get_goal_reward(self, goal):
        """
        Returns the reward distribution for a goal state taking the best path to the goal into account.
        """

        max_path_reward = self.get_max_path_reward(goal)
        # Update the distribution by the value along the path
        goal_state = self._state[goal]
        if hasattr(goal_state, "sample"):
            if hasattr(goal_state, "mu") and hasattr(goal_state, "sigma"):
                return Normal(goal_state.mu + max_path_reward,
                              goal_state.sigma)
            elif hasattr(goal_state, "vals") and hasattr(goal_state, "probs"):
                vals = tuple(
                    [value + max_path_reward for value in goal_state.vals])
                return Categorical(vals, goal_state.probs)
            else:
                print(f"Type {type(goal_state)} not supported.")
                raise NotImplementedError()
        else:
            return goal_state + max_path_reward
Example #19
    def high_myopic_voc(self, state, action, bins=4):
        """Returns the high level myopic VOC

        Arguments:
            state: high state for computation
            action: high level action for computation
            bins: number of bins to discretize continuous distribution
        """
        option_to_explore = action - (len(self.init) + self.no_options - 1)
        goal_clicked = self.goals[option_to_explore - 1][0]
        node = self.low_state[goal_clicked]
        if hasattr(node, 'sample'):
            if hasattr(node, 'mu'):
                dist = node.to_discrete(n=bins, max_sigma=4)
                dist.vals = tuple([(round(val, 3)) for val in dist.vals])
                dist.probs = tuple([(round(p, 3)) for p in dist.probs])
            else:
                dist = node
        else:
            dist = Categorical(vals=[node], probs=[1])

        r, p = zip(*dist)
        expected_return = 0
        high_state = []
        for op in range(1, self.no_options + 1):
            val, _ = self.low_term_reward(op, self.low_state)
            high_state.append(val)

        # Find best option to explore for each possible goal value
        for k in range(len(p)):
            state2 = list(self.low_state)
            state2[goal_clicked] = r[k]
            high_state[option_to_explore - 1], _ = self.high_belief_update(
                state2, option_to_explore)
            expected_return += p[k] * max(high_state)
        return expected_return - self.expected_high_term_reward(state)
Example #20
 def _reward_prediction_loss(self):
     self.reward_prediction_labels = self._reward_prediction_target_placeholder(
         "reward_prediction_target", 1)
     return tf.reduce_sum(
         Categorical(self.reward_prediction_logits).cross_entropy(
             self.reward_prediction_labels))
Example #21
def exact_node_value_after_observe(state, operations):
    """ Computes the categorical node value of the tree by applying the passed operations.

    Args:
        state ([Categorical]): Node value distribution based on the observation.
        operations ([(str, int, int)]): Operations to be applied.

    Returns:
        result (Categorical): Categorical node value of the final tree node. 
    """
    def reduce_add(state, i, j):
        new_state = [state[k] for k in range(len(state)) if k != j]
        new_state[i] = new_state[i] + state[j]
        return new_state

    def reduce_mul(state, i, j):
        new_state = [state[k] for k in range(len(state)) if k != j]
        new_state[i] = cross([state[i], state[j]], max)
        return new_state

    def split_obs(obs, x, num_parents):
        node_var: Categorical = obs[x]
        obs_values = []
        # zip() wraps each value in a 1-tuple, which becomes the single-outcome support of a new Categorical
        for v in zip(node_var.vals):
            obs_copy = [o for o in obs]
            obs_copy[x] = Categorical(v)
            # Copy obs to new nodes
            for i in range(num_parents - 1):
                obs_copy.append(Categorical(v))
            obs_values.append(obs_copy)
        probs = node_var.probs
        return probs, obs_values

    for i in range(len(operations)):
        op, a, b = operations[i]
        if op == "add":
            state = reduce_add(state, a, b)
        elif op == "mul":
            state = reduce_mul(state, a, b)
        elif op == "split":
            # a = split node, b = num parents
            probs, obs_vals = split_obs(state, a, b)
            states = []
            # Solve partial trees recursively
            for val in obs_vals:
                states.append(
                    exact_node_value_after_observe(val, operations[i + 1:]))
            # Combine partial responses
            total_p = []
            total_v = []
            for var_p, cat in zip(probs, states):
                assert len(cat) == 1
                p, v = cat[0].probs, cat[0].vals
                p = [x * var_p for x in p]
                total_p += p
                total_v += v
            state = [Categorical(total_v, probs=total_p)]
            break
        else:
            assert False
    assert len(state) == 1
    return state[0]
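A tiny worked sketch using only "add" operations, assuming Categorical supports + on distributions as the examples above do (e.g. reward += r); the two-node state is made up.

state = [Categorical([0, 10], probs=[0.5, 0.5]),
         Categorical([-5, 5], probs=[0.5, 0.5])]
operations = [("add", 0, 1)]  # fold node 1 into node 0
result = exact_node_value_after_observe(state, operations)
# result is the distribution of the sum: outcomes -5, 5, 5, 15 (two ways to reach 5),
# each with probability 1/4, so the expectation is 5.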
Example #22
# Environment parameters
SWITCH_COST = 0  # Cost of switching goals
HIGH_COST = 10  # Cost of computing a goal
LOW_COST = 10  # Cost of computing a low level node
SEED = 0  # Fixes generated environments for training
COST_FUNC = "Basic"

TREE = [[1, 16, 31, 46], [2, 3, 4, 5], [6], [6], [7], [7], [8], [8],
        [9, 10, 11, 12], [13], [13], [14], [14], [15], [15], [],
        [17, 18, 19, 20], [21], [21], [22], [22], [23], [23], [24, 25, 26, 27],
        [28], [28], [29], [29], [30], [30], [], [32, 33, 34, 35], [36], [36],
        [37], [37], [38], [38], [39, 40, 41, 42], [43], [43], [44], [44], [45],
        [45], [], [47, 48, 49, 50], [51], [51], [52], [52], [53], [53],
        [54, 55, 56, 57], [58], [58], [59], [59], [60], [60], []]
d0 = Categorical([0])
dr = Categorical([-1500, 0], probs=[0.1, 0.9])
di = Categorical([-10, -5, 5, 10])
dg = Categorical([0, 25, 75, 100])
node_types = [
    di, d0, di, di, di, di, di, di, dr, di, di, di, di, di, di, dg, d0, di, di,
    di, di, di, di, dr, di, di, di, di, di, di, dg, d0, di, di, di, di, di, di,
    dr, di, di, di, di, di, di, dg, d0, di, di, di, di, di, di, dr, di, di, di,
    di, di, di, dg
]
INIT = tuple([r for r in node_types])

W = np.array([[0.45137647, 0.2288873, 9.26596405, 0.17091717, 2.24210099]])
high_risk_clicks = [8, 23, 38, 53]
goal_clicks = [15, 30, 45, 60]
term_click = 61
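For reference, a small sanity check on the reward distributions defined above, assuming unspecified probabilities default to uniform (as the point-mass usage Categorical([0]) suggests) and the expectation() method seen in the other examples.

# dr: 0.1 * (-1500) + 0.9 * 0 = -150
# di: uniform over {-10, -5, 5, 10} -> 0
# dg: uniform over {0, 25, 75, 100} -> 50
print(dr.expectation())  # -150.0
print(di.expectation())  # 0.0
print(dg.expectation())  # 50.0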