Example #1
    def run_evaluation(self):
        current_network = ChessNetwork(self.net_name)
        try:
            current_network.load(version='current')
        except ValueError:
            logger.fatal('Cannot evaluate a model without at least '
                         'a "current" version.')
            raise AssertionError('No current version of network.')

        nextgen_network = ChessNetwork(self.net_name)
        try:
            nextgen_network.load(version='nextgen', ckpt=self.ckpt)
        except ValueError:
            logger.warning('No nextgen version of this model - testing '
                           'play against blank slate version.')

        mcts_params = dotdict(n_sims=100,
                              c_base=4.0,
                              c_init=1.0,
                              eps=0.155,
                              resign_threshold=-0.85,
                              temperature=1,
                              use_noise=True)

        c_mcts = MCTS(current_network, params=mcts_params)
        ng_mcts = MCTS(nextgen_network, params=mcts_params)

        env = ChessEnvironment()

        agent_params = dotdict(temp_threshold=0, max_hmoves=50, n_book_moves=5)

        c_version = ChessAgent(c_mcts, env, params=agent_params)
        ng_version = ChessAgent(ng_mcts, env, params=agent_params)

        self.play_game(c_version, ng_version, env)
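The snippet builds its parameter bundles with dotdict, which is not shown here. A minimal sketch assuming it follows the common attribute-access dictionary pattern (the class and usage below are an assumption, not code from this project):

class dotdict(dict):
    """Dictionary with attribute-style access: params.n_sims == params['n_sims']."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

# Usage mirroring the mcts_params constructed above.
params = dotdict(n_sims=100, c_base=4.0, c_init=1.0)
assert params.n_sims == 100 and params.c_base == 4.0
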
Example #2
    def perform_validation_step(self, task_index):
        """
        Perform validation steps for the task from index task_index.

        Args:
            task_index: task index

        Returns:
            (rewards, traces lengths)

        """
        validation_rewards = []
        traces_lengths = []
        for _ in range(self.num_validation_episodes):
            # Start new episode
            mcts = MCTS(self.policy, self.env, task_index,
                        **self.mcts_test_params)

            # Sample an execution trace with mcts using policy as a prior
            trace = mcts.sample_execution_trace()
            task_reward = trace[7]
            trace_length = len(trace[3])
            progs_failed_indices = trace[10]

            validation_rewards.append(task_reward)
            traces_lengths.append(trace_length)
        return validation_rewards, traces_lengths, progs_failed_indices
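A hedged sketch of how the returned lists might be summarised after validation; the aggregation and the numbers below are illustrative, not taken from the source:

import numpy as np

# Stand-in values for the lists returned by perform_validation_step.
validation_rewards = [1.0, -1.0, 1.0, 1.0]
traces_lengths = [12, 30, 14, 11]

success_rate = float(np.mean([1.0 if r > 0 else 0.0 for r in validation_rewards]))
mean_trace_length = float(np.mean(traces_lengths))
print(f"success rate: {success_rate:.2f}, mean trace length: {mean_trace_length:.1f}")
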
Example #3
    def play_episode(self):
        # reset env
        obs = self.env.reset()
        env_state = self.env.get_state()

        done = False
        t = 0
        total_reward = 0.0

        mcts = MCTS(self.config)

        root_node = Node(state=env_state,
                         done=False,
                         obs=obs,
                         reward=0,
                         action=None,
                         parent=RootParentNode(env=self.env_creator()),
                         mcts=mcts,
                         depth=0)

        compute_action_times = []
        while not done:
            t += 1
            # compute action choice
            t0 = time.time()
            tree_policy, action, _, root_node = mcts.compute_action(root_node)
            # compute_action returns the selected child as the new root; give it a
            # fresh RootParentNode so the rest of the old tree can be discarded.
            root_node.parent = RootParentNode(env=self.env_creator())
            compute_action_times.append(time.time() - t0)

            # take action
            obs, reward, done, info = self.env.step(action)
            total_reward += reward

        avg_time = np.mean(compute_action_times)
        return t, total_reward, avg_time
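A minimal sketch of how play_episode might be driven for evaluation; the evaluate helper and its runner argument are assumptions for illustration, not part of the source:

import numpy as np

def evaluate(runner, n_episodes=10):
    """Average episode length, return, and per-move planning time over several episodes."""
    results = [runner.play_episode() for _ in range(n_episodes)]
    steps, returns, times = zip(*results)
    return np.mean(steps), np.mean(returns), np.mean(times)
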
Example #4
    def play_iteration(self, task_index, verbose=False):
        """
        Play one training iteration, i.e. select a task, play episodes, store experience in buffer and sample batches
        to perform gradient descent on policy weights.

        """

        # Get new task to attempt
        task_name = self.env.get_program_from_index(task_index)
        if self.verbose:
            print('Attempt task {} (length {}) for {} episodes'.format(
                task_name, self.env.length, self.num_episodes_per_task))

        # Start training on the task
        for episode in range(self.num_episodes_per_task):
            if self.verbose:
                print('=> Episode: %d' % (episode))

            # Start new episode
            mcts = MCTS(self.policy, self.env, task_index,
                        **self.mcts_train_params)

            # Sample an execution trace with mcts using policy as a prior
            res = mcts.sample_execution_trace()
            (observations, prog_indices, previous_actions_indices, policy_labels,
             lstm_states, _, _, task_reward, clean_sub_execution, rewards,
             programs_failed_indices, programs_failed_initstates) = res

            if self.verbose:
                print("Task_reward:")
                print(task_reward)
                print("Rewards:")
                print(rewards)

            # record trace and store it in buffer only if no problem in sub-programs execution
            if clean_sub_execution:
                # Generates trace
                trace = list(
                    zip(observations, prog_indices, lstm_states, policy_labels,
                        rewards))
                # Append trace to buffer
                self.buffer.append_trace(trace)
            else:
                if self.verbose:
                    print("Trace has not been stored in buffer.")

                # Decrease statistics of programs that failed
                # for idx in programs_failed_indices:
                #     self.curriculum_scheduler.update_statistics(idx, torch.FloatTensor([0.0]))

            # Train policy on batch
            if self.buffer.get_memory_length() > self.batch_size:
                for _ in range(self.num_updates_per_episode):
                    batch = self.buffer.sample_batch(self.batch_size)
                    if batch is not None:
                        self.policy.train_on_batch(batch)
            if verbose:
                print("Done episode {}/{}".format(episode + 1,
                                                  self.num_episodes_per_task))
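The replay buffer interface relied on above (append_trace, get_memory_length, sample_batch) is not part of this snippet. A minimal sketch of one plausible implementation, offered purely as an assumption about that interface:

import random

class TraceBuffer:
    """Minimal trace buffer sketch: stores (observation, program index, lstm state,
    policy label, reward) tuples and samples uniform random batches."""

    def __init__(self, max_length=2000):
        self.memory = []
        self.max_length = max_length

    def append_trace(self, trace):
        # Keep only the most recent max_length transitions.
        self.memory.extend(trace)
        self.memory = self.memory[-self.max_length:]

    def get_memory_length(self):
        return len(self.memory)

    def sample_batch(self, batch_size):
        # Return None when there is not enough data, as the caller above expects.
        if len(self.memory) < batch_size:
            return None
        return random.sample(self.memory, batch_size)
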
Example #5
    def make_target(self, state_index: int, num_unroll_steps: int, td_steps: int, model=None, config=None):
        # The value target is the discounted root value of the search tree N steps into the future, plus
        # the discounted sum of all rewards until then.
        target_values, target_rewards, target_policies = [], [], []
        for current_index in range(state_index, state_index + num_unroll_steps + 1):
            bootstrap_index = current_index + td_steps
            if bootstrap_index < len(self.root_values):
                if model is None:
                    value = self.root_values[bootstrap_index] * self.discount ** td_steps
                else:
                    # Reference: Appendix H => Reanalyze
                    # Note: a target network based on recent parameters is used to provide a fresher,
                    # stable n-step bootstrapped target for the value function.
                    obs = self.obs(bootstrap_index)
                    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
                    network_output = model.initial_inference(obs)
                    value = network_output.value.data.cpu().item() * self.discount ** td_steps
            else:
                value = 0

            for i, reward in enumerate(self.rewards[current_index:bootstrap_index]):
                value += reward * self.discount ** i

            if current_index > 0 and current_index <= len(self.rewards):
                last_reward = self.rewards[current_index-1]
            else:
                last_reward = 0

            if current_index < len(self.root_values):
                target_values.append(value)
                target_rewards.append(last_reward)

                # Reference: Appendix H => Reanalyze
                # Note: MuZero Reanalyze revisits its past time-steps and re-executes its search using the
                # latest model parameters, potentially resulting in a better quality policy than the original search.
                # This fresh policy is used as the policy target for 80% of updates during MuZero training.
                if model is not None and np.random.random() <= config.revisit_policy_search_rate:
                    from core.mcts import MCTS, Node
                    root = Node(0)
                    obs = self.obs(current_index)
                    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
                    network_output = model.initial_inference(obs)
                    root.expand(self.to_play(), self.legal_actions(), network_output)
                    MCTS(config).run(root, self.action_history(current_index), model)
                    self.store_search_stats(root, current_index)

                target_policies.append(self.child_visits[current_index])

            else:
                # States past the end of games are treated as absorbing states.
                target_values.append(0)
                target_rewards.append(last_reward)
                # Note: target policy is set to 0 so that no policy loss is calculated for them.
                target_policies.append([0 for _ in range(len(self.child_visits[0]))])

        return target_values, target_rewards, target_policies
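The value target built above is the standard n-step bootstrapped return: the discounted root value td_steps ahead plus the discounted rewards collected until then. A standalone sketch of that computation (the function name and toy numbers are illustrative, not from the source):

def n_step_value(rewards, root_values, index, td_steps, discount):
    """Discounted root value td_steps ahead plus the discounted rewards until then."""
    bootstrap_index = index + td_steps
    if bootstrap_index < len(root_values):
        value = root_values[bootstrap_index] * discount ** td_steps
    else:
        value = 0.0
    for i, reward in enumerate(rewards[index:bootstrap_index]):
        value += reward * discount ** i
    return value

# Toy check: two rewards, then bootstrap from a root value of 0.5 two steps ahead.
print(n_step_value([1.0, 0.0, 1.0, 0.0], [0.1, 0.2, 0.5, 0.3], index=0, td_steps=2, discount=0.99))
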
Example #6
    def run_selfplay(self):
        '''
        Executes a self-play task.

        Establishes an Agent in an Environment and lets the Agent
        play a full game of chess against itself.

        The resulting training examples are returned.
        '''
        network = ChessNetwork(name=self.net_name)
        try:
            network.load(version=self.version)
        except ValueError:
            # No saved weights for this version yet - continue with the freshly
            # initialised ("blank slate") network.
            pass

        env = ChessEnvironment()

        search_tree = MCTS(network)
        agent = ChessAgent(search_tree, env)
        exs = agent.play(game_name=f'{self.net_name}_game{self.iteration+1}', save=False)
        
        return exs
Example #7
            'number_of_simulations': conf.number_of_simulations_for_validation,
            'max_depth_dict': max_depth_dict,
            'temperature': conf.temperature,
            'c_puct': conf.c_puct,
            'exploit': True,
            'level_closeness_coeff': conf.level_closeness_coeff,
            'gamma': conf.gamma
        }

        for _ in range(40):

            env = ListEnv(length=len, encoding_dim=conf.encoding_dim)
            bubblesort_index = env.programs_library['BUBBLESORT']['index']

            # Test with mcts
            mcts = MCTS(policy, env, bubblesort_index, **mcts_test_params)
            res = mcts.sample_execution_trace()
            mcts_reward = res[7]
            mcts_rewards.append(mcts_reward)
            if mcts_reward > 0:
                mcts_rewards_normalized.append(1.0)
            else:
                mcts_rewards_normalized.append(0.0)

            # Test with network alone
            network_only = NetworkOnly(policy, env, max_depth_dict)
            netonly_reward, _ = network_only.play(bubblesort_index)
            network_only_rewards.append(netonly_reward)

        mcts_rewards_normalized_mean = np.mean(
            np.array(mcts_rewards_normalized))
Example #8
            env = QuickSortListEnv(length=len_, encoding_dim=conf.encoding_dim,
                                   expose_stack=expose_stack, without_partition_update=without_p_upd,
                                   sample_from_errors_prob=samp_err_poss, reduced_set=reduced_op_set,
                                   recursive_version=recursive_quicksort,
                                   expose_pointers_value=do_not_expose_pointer_values)

            try:
                operation_index = env.programs_library[args.operation]['index']
            except KeyError:
                print("The model analyzed does not have the operation", args.operation)
                exit(1)

            # Test with mcts
            mcts = MCTS(policy, env, operation_index, **mcts_test_params)
            res = mcts.sample_execution_trace()
            mcts_reward = res[7]
            mcts_rewards.append(mcts_reward)
            if mcts_reward > 0:
                mcts_rewards_normalized.append(1.0)
            else:
                mcts_rewards_normalized.append(0.0)

            # Test with network alone
            network_only = NetworkOnly(policy, env, max_depth_dict)
            netonly_reward, _ = network_only.play(operation_index)
            network_only_rewards.append(netonly_reward)

        mcts_rewards_normalized_mean = np.mean(np.array(mcts_rewards_normalized))
        mcts_rewards_mean = np.mean(np.array(mcts_rewards))
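A hedged illustration of how the two aggregate scores computed above might be compared; the per-episode numbers below are made up for the example:

import numpy as np

# Stand-in per-episode results; in the snippet above these come from the MCTS
# rollouts and from the NetworkOnly rollouts respectively.
mcts_rewards_normalized = [1.0, 1.0, 0.0, 1.0]
network_only_rewards = [1.0, 0.0, 0.0, 1.0]

print("MCTS success rate:         {:.2f}".format(np.mean(mcts_rewards_normalized)))
print("Network-only success rate: {:.2f}".format(np.mean(network_only_rewards)))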