Example #1
    def unity_inference(self):
        """
        inference mode. algorithm model will not be train, only used to show agents' behavior
        """
        if self.use_GCN:
            action = zeros_initializer(self.env.brain_num, 1)
            while True:
                ObsRewDone = self.env.reset()
                while True:
                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        action[i] = self.models[i].choose_action(
                            adj=_adj, x=_x, visual_s=_vs, evaluation=True)
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)
        else:
            action = zeros_initializer(self.env.brain_num, 1)
            while True:
                ObsRewDone = self.env.reset()
                while True:
                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        action[i] = self.models[i].choose_action(
                            s=_v, visual_s=_vs, evaluation=True)
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)
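
Note: every example on this page relies on a `zeros_initializer` helper that is not shown here. Judging only from the call sites (`zeros_initializer(self.env.brain_num, 1)` is assigned to a single list, while `zeros_initializer(self.env.brain_num, 3)` unpacks into three), a minimal sketch might look like this; it is a reconstruction from usage, not the project's actual implementation:

def zeros_initializer(n, count):
    """Sketch (assumed behavior): return `count` lists of `n` zeros each.
    When count == 1, return the single list directly, so that
    `action = zeros_initializer(brain_num, 1)` works as in the examples."""
    lists = [[0] * n for _ in range(count)]
    return lists[0] if count == 1 else lists
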
Example #2
    def unity_random_sample(self, steps):
        if self.use_GCN:
            adj, x, visual_state = zeros_initializer(self.env.brain_num, 3)

            ObsRewDone = self.env.reset()
            for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                adj[i] = _adj
                x[i] = _x
                visual_state[i] = _vs

            for _ in range(steps):
                action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].store_data_gcn(adj=adj[i],
                                                  x=x[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  adj_=_adj,
                                                  x_=_x,
                                                  visual_s_=_vs,
                                                  done=_d)
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
            self.pwi('Noise addition complete.')
        else:
            state, visual_state = zeros_initializer(self.env.brain_num, 2)

            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                state[i] = _v
                visual_state[i] = _vs

            for _ in range(steps):
                action = self.env.random_action()
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    self.models[i].store_data(s=state[i],
                                              visual_s=visual_state[i],
                                              a=action[i],
                                              r=_r,
                                              s_=_v,
                                              visual_s_=_vs,
                                              done=_d)
                    state[i] = _v
                    visual_state[i] = _vs
            self.pwi('Noise addition complete.')
Example #3
    def unity_random_sample(self, steps):
        state, visual_state = zeros_initializer(self.env.brain_num, 2)

        ObsRewDone = self.env.reset()
        for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
            state[i] = _v
            visual_state[i] = _vs

        for _ in range(steps):
            action = self.env.random_action()
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            ObsRewDone = self.env.step(vector_action=actions)
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                self.models[i].store_data(s=state[i],
                                          visual_s=visual_state[i],
                                          a=action[i],
                                          r=_r,
                                          s_=_v,
                                          visual_s_=_vs,
                                          done=_d)
                state[i] = _v
                visual_state[i] = _vs
        self.pwi('Noise addition complete.')
Example #4
def unity_random_sample(env, models, print_func, steps):
    state, visual_state = zeros_initializer(env.brain_num, 2)

    ObsRewDone = env.reset()
    for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
        state[i] = _v
        visual_state[i] = _vs

    for _ in range(steps):
        action = env.random_action()
        actions = {
            f'{brain_name}': action[i]
            for i, brain_name in enumerate(env.brain_names)
        }
        ObsRewDone = env.step(actions)
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            models[i].store_data(s=state[i],
                                 visual_s=visual_state[i],
                                 a=action[i],
                                 r=_r,
                                 s_=_v,
                                 visual_s_=_vs,
                                 done=_d)
            state[i] = _v
            visual_state[i] = _vs
    print_func('Noise addition complete.')
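
The loops above treat `ObsRewDone` as one tuple per brain; in this example the tuple is `(vector_obs, visual_obs, reward, done, info)`. A stub environment that satisfies that interface (purely illustrative, not part of RLs) makes the expected shapes explicit:

import numpy as np

class StubEnv:
    """Illustrative stand-in for the Unity wrapper used above (not RLs code).
    One brain with two agents; reset()/step() yield one
    (vector_obs, visual_obs, reward, done, info) tuple per brain."""
    brain_num = 1
    brain_names = ['Brain']
    brain_agents = [2]

    def _obs(self):
        v = np.zeros((2, 4), dtype=np.float32)   # vector obs, one row per agent
        vs = np.zeros((2, 0), dtype=np.float32)  # no visual obs in this stub
        r = np.zeros(2, dtype=np.float32)
        d = np.zeros(2, dtype=bool)
        return [(v, vs, r, d, {'real_done': d})]

    def reset(self):
        return self._obs()

    def step(self, actions):
        return self._obs()

    def random_action(self):
        return [np.zeros((2, 1), dtype=np.int32)]
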
Example #5
File: unity.py Project: yyht/RLs
def ma_unity_no_op(env, models, buffer, print_func, pre_fill_steps, prefill_choose):
    assert isinstance(pre_fill_steps, int), 'multi-agent no_op.steps must be an int'

    if pre_fill_steps < buffer.batch_size:
        pre_fill_steps = buffer.batch_size
    state, action, reward, next_state, dones = zeros_initializer(env.brain_num, 5)
    ObsRewDone = env.reset()
    for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
        state[i] = _v

    for i in range(env.brain_num):
        # initialize actions to zeros
        if env.is_continuous[i]:
            # continuous action placeholders should be floats, not ints
            action[i] = np.zeros((env.brain_agents[i], env.a_dim[i][0]), dtype=np.float32)
        else:
            action[i] = np.zeros((env.brain_agents[i], 1), dtype=np.int32)

    a = [np.asarray(e) for e in zip(*action)]  # initial value; recomputed inside the loop before use
    for _ in trange(pre_fill_steps, ncols=80, desc='Pre-filling', bar_format=bar_format):
        for i in range(env.brain_num):
            if prefill_choose:
                action[i] = models[i].choose_action(s=state[i])
        actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.brain_names)}
        ObsRewDone = env.step(actions)
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            reward[i] = _r[:, np.newaxis]
            next_state[i] = _vs
            dones[i] = _d[:, np.newaxis]

        def func(x):
            return [np.asarray(e) for e in zip(*x)]
        s, a, r, s_, done = map(func, [state, action, reward, next_state, dones])
        buffer.add(s, a, r, s_, done)
        for i in range(env.brain_num):
            state[i] = next_state[i]
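
The inline `func` above transposes brain-major lists into agent-major arrays before they reach the buffer. A standalone worked example (illustrative values only) shows the effect of the `zip(*x)` trick:

import numpy as np

def func(x):
    return [np.asarray(e) for e in zip(*x)]

# two brains, each observing two agents with 3-dim states
state = [np.arange(6).reshape(2, 3),      # brain 0
         np.arange(6, 12).reshape(2, 3)]  # brain 1
s = func(state)
# s[0] stacks agent 0's rows from both brains, s[1] stacks agent 1's
print(len(s), s[0].shape)  # 2 (2, 3)
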
Example #6
File: unity.py Project: yyht/RLs
def unity_no_op(env, models, print_func, pre_fill_steps, prefill_choose, real_done, desc='Pre-filling'):
    '''
    Interact with the environment without performing learning updates, in order to prepopulate the ReplayBuffer.
    Make sure `pre_fill_steps` is greater than the n-step horizon if using an n-step ReplayBuffer.
    '''
    assert isinstance(pre_fill_steps, int) and pre_fill_steps >= 0, 'no_op.steps must be a non-negative int'
    state, visual_state, action = zeros_initializer(env.brain_num, 3)

    [model.reset() for model in models]
    ObsRewDone = env.reset()
    for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
        state[i] = _v
        visual_state[i] = _vs

    for _ in trange(0, pre_fill_steps, min(env.brain_agents) + 1, unit_scale=min(env.brain_agents) + 1, ncols=80, desc=desc, bar_format=bar_format):
        if prefill_choose:
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i], visual_s=visual_state[i])
        else:
            action = env.random_action()
        actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.brain_names)}
        ObsRewDone = env.step(actions)
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            models[i].no_op_store(
                s=state[i],
                visual_s=visual_state[i],
                a=action[i],
                r=_r,
                s_=_v,
                visual_s_=_vs,
                done=_info['real_done'] if real_done else _d
            )
            models[i].partial_reset(_d)
            state[i] = _v
            visual_state[i] = _vs
Example #7
File: unity.py Project: yyht/RLs
def ma_unity_inference(env, models, episodes):
    """
    inference mode. algorithm model will not be train, only used to show agents' behavior
    """
    action = zeros_initializer(env.brain_num, 1)
    for episode in range(episodes):
        ObsRewDone = env.reset()
        while True:
            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                action[i] = models[i].choose_action(s=_v, evaluation=True)
            actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.brain_names)}
            ObsRewDone = env.step(actions)
Example #8
    def ma_unity_no_op(self):
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(steps, int), 'multi-agent no_op.steps must be an int'

        if steps < self.ma_data.batch_size:
            steps = self.ma_data.batch_size
        state, action, reward, next_state, dones = zeros_initializer(
            self.env.brain_num, 5)
        ObsRewDone = self.env.reset(train_mode=False)
        for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
            state[i] = _v

        for i in range(self.env.brain_num):
            # initialize actions to zeros
            if self.env.is_continuous[i]:
                # continuous action placeholders should be floats, not ints
                action[i] = np.zeros(
                    (self.env.brain_agents[i], self.env.a_dim_or_list[i][0]),
                    dtype=np.float32)
            else:
                action[i] = np.zeros(
                    (self.env.brain_agents[i], len(self.env.a_dim_or_list[i])),
                    dtype=np.int32)

        a = [np.asarray(e) for e in zip(*action)]  # initial value; recomputed inside the loop before use
        for step in range(steps):
            self.pwi(f'no op step {step}')
            for i in range(self.env.brain_num):
                if choose:
                    action[i] = self.models[i].choose_action(s=state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            ObsRewDone = self.env.step(vector_action=actions)
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                reward[i] = _r[:, np.newaxis]
                next_state[i] = _vs
                dones[i] = _d[:, np.newaxis]

            def func(x):
                return [np.asarray(e) for e in zip(*x)]

            s, a, r, s_, done = map(func,
                                    [state, action, reward, next_state, dones])
            self.ma_data.add(s, a, r, s_, done)
            for i in range(self.env.brain_num):
                state[i] = next_state[i]
Example #9
def unity_no_op(env, models, print_func, pre_fill_steps, prefill_choose):
    '''
    Interact with the environment without performing learning updates, in order to prepopulate the ReplayBuffer.
    Make sure `pre_fill_steps` is greater than the n-step horizon if using an n-step ReplayBuffer.
    '''
    assert isinstance(pre_fill_steps, int) and pre_fill_steps >= 0, \
        'no_op.steps must be a non-negative int'
    state, visual_state, action = zeros_initializer(env.brain_num, 3)

    [model.reset() for model in models]
    ObsRewDone = env.reset()
    for i, (_v, _vs, _, _) in enumerate(ObsRewDone):
        state[i] = _v
        visual_state[i] = _vs

    steps = pre_fill_steps // min(env.brain_agents) + 1

    for step in range(steps):
        print_func(f'no op step {step}')
        if prefill_choose:
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i],
                                                    visual_s=visual_state[i])
        else:
            action = env.random_action()
        actions = {
            f'{brain_name}': action[i]
            for i, brain_name in enumerate(env.brain_names)
        }
        ObsRewDone = env.step(actions)
        for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
            models[i].no_op_store(s=state[i],
                                  visual_s=visual_state[i],
                                  a=action[i],
                                  r=_r,
                                  s_=_v,
                                  visual_s_=_vs,
                                  done=_d)
            models[i].partial_reset(_d)
            state[i] = _v
            visual_state[i] = _vs
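
The `steps = pre_fill_steps // min(env.brain_agents) + 1` line reflects that each `env.step()` stores one transition per agent, so even the brain with the fewest agents gains `min(env.brain_agents)` transitions per iteration. A quick check of the arithmetic:

pre_fill_steps = 1000   # transitions wanted per brain
brain_agents = [8, 12]  # agents controlled by each brain
steps = pre_fill_steps // min(brain_agents) + 1  # 1000 // 8 + 1 = 126
# 126 iterations give every brain at least 126 * 8 = 1008 >= 1000 transitions
print(steps)
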
Example #10
    def unity_no_op(self):
        '''
        Interact with the environment without performing learning updates, in order to prepopulate the ReplayBuffer.
        Make sure `steps` is greater than the n-step horizon if using an n-step ReplayBuffer.
        '''
        steps = self.train_args['pre_fill_steps']
        choose = self.train_args['prefill_choose']
        assert isinstance(steps, int) and steps >= 0, \
            'no_op.steps must be a non-negative int'

        state, visual_state, action = zeros_initializer(self.env.brain_num, 3)
        ObsRewDone = self.env.reset()
        for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
            state[i] = _v
            visual_state[i] = _vs

        steps = steps // min(self.env.brain_agents) + 1

        for step in range(steps):
            self.pwi(f'no op step {step}')
            if choose:
                for i in range(self.env.brain_num):
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i])
            else:
                action = self.env.random_action()
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(self.env.brain_names)
            }
            ObsRewDone = self.env.step(vector_action=actions)
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                self.models[i].no_op_store(s=state[i],
                                           visual_s=visual_state[i],
                                           a=action[i],
                                           r=_r,
                                           s_=_v,
                                           visual_s_=_vs,
                                           done=_d)
                state[i] = _v
                visual_state[i] = _vs
Example #11
def unity_inference(env, models):
    """
    inference mode. algorithm model will not be train, only used to show agents' behavior
    """
    action = zeros_initializer(env.brain_num, 1)

    while True:
        [model.reset() for model in models]
        ObsRewDone = env.reset()
        while True:
            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                action[i] = models[i].choose_action(s=_v,
                                                    visual_s=_vs,
                                                    evaluation=True)
                models[i].partial_reset(_d)
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(env.brain_names)
            }
            ObsRewDone = env.step(actions)
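
`model.reset()` and `model.partial_reset(_d)` appear wherever episodes can end for a subset of agents. Their implementation is not shown on this page; assuming they manage per-agent recurrent state (a guess from the call pattern, not confirmed by the source), a minimal sketch could be:

import numpy as np

class RecurrentStateHolder:
    """Hypothetical sketch: reset() clears all hidden states, while
    partial_reset(done) zeroes only the agents whose episode just ended."""

    def __init__(self, n_agents, hidden_dim):
        self.hidden = np.zeros((n_agents, hidden_dim), dtype=np.float32)

    def reset(self):
        self.hidden[:] = 0.0

    def partial_reset(self, done):
        # done: boolean array of shape (n_agents,)
        self.hidden[np.asarray(done, dtype=bool)] = 0.0
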
Example #12
    def ma_unity_train(self):
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        assert policy_mode == 'off-policy', "multi-agent algorithms currently support off-policy only."

        batch_size = self.ma_data.batch_size
        state, action, new_action, next_action, reward, next_state, dones, dones_flag, rewards = zeros_initializer(
            self.env.brain_num, 9)

        for episode in range(begin_episode, max_episode):
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                dones_flag[i] = np.zeros(self.env.brain_agents[i])
                rewards[i] = np.zeros(self.env.brain_agents[i])
                state[i] = _v
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i in range(self.env.brain_num):
                    action[i] = self.models[i].choose_action(s=state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                ObsRewDone = self.env.step(vector_action=actions)

                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    reward[i] = _r[:, np.newaxis]
                    next_state[i] = _v
                    dones[i] = _d[:, np.newaxis]
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += _d
                    rewards[i][unfinished_index] += _r[unfinished_index]

                def func(x):
                    return [np.asarray(e) for e in zip(*x)]

                s, a, r, s_, done = map(
                    func, [state, action, reward, next_state, dones])
                self.ma_data.add(s, a, r, s_, done)

                for i in range(self.env.brain_num):
                    state[i] = next_state[i]

                s, a, r, s_, done = self.ma_data.sample()
                for i, brain_name in enumerate(self.env.brain_names):
                    next_action[i] = self.models[i].get_target_action(s=s_[:, i])
                    new_action[i] = self.models[i].choose_action(
                        s=s[:, i], evaluation=True)
                a_ = np.asarray([np.asarray(e) for e in zip(*next_action)])
                if policy_mode == 'off-policy':
                    for i in range(self.env.brain_num):
                        self.models[i].learn(
                            episode=episode,
                            ap=np.asarray([
                                np.asarray(e) for e in zip(*next_action[:i])
                            ]).reshape(batch_size, -1) if i != 0 else np.zeros(
                                (batch_size, 0)),
                            al=np.asarray([
                                np.asarray(e) for e in zip(
                                    *next_action[-(self.env.brain_num - i -
                                                   1):])
                            ]).reshape(batch_size, -1)
                            if self.env.brain_num - i != 1 else np.zeros(
                                (batch_size, 0)),
                            ss=s.reshape(batch_size, -1),
                            ss_=s_.reshape(batch_size, -1),
                            aa=a.reshape(batch_size, -1),
                            aa_=a_.reshape(batch_size, -1),
                            s=s[:, i],
                            r=r[:, i])

                if all([all(dones_flag[i])
                        for i in range(self.env.brain_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            for i in range(self.env.brain_num):
                self.models[i].writer_summary(episode,
                                              total_reward=rewards[i].mean(),
                                              step=last_done_step)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            if episode % save_frequency == 0:
                for i in range(self.env.brain_num):
                    self.models[i].save_checkpoint(episode)
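
The reward bookkeeping in these training loops relies on a small NumPy idiom: `dones_flag[i]` accumulates done signals, and the `unfinished_index` mask (computed before the update) lets an agent collect the reward of the step on which it finishes, but nothing afterwards. A standalone demonstration:

import numpy as np

rewards = np.zeros(3)
dones_flag = np.zeros(3)  # 0.0 means the agent is still running

_r = np.array([1.0, 1.0, 1.0])
_d = np.array([False, True, False])

unfinished_index = np.where(dones_flag == False)[0]  # all three agents
dones_flag += _d                                     # agent 1 is now done
rewards[unfinished_index] += _r[unfinished_index]    # agent 1 still gets this step's reward

# next step: agent 1 no longer accumulates
unfinished_index = np.where(dones_flag == False)[0]  # agents 0 and 2
rewards[unfinished_index] += _r[unfinished_index]
print(rewards)  # [2. 1. 2.]
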
Example #13
    def unity_train(self):
        """
        Train loop. Execute until episode reaches its maximum or press 'ctrl+c' artificially.
        Inputs:
            env:                    Environment for interaction.
            models:                 all models for this trianing task.
            save_frequency:         how often to save checkpoints.
            reset_config:           configuration to reset for Unity environment.
            max_step:               maximum number of steps for an episode.
            sampler_manager:        sampler configuration parameters for 'reset_config'.
            resampling_interval:    how often to resample parameters for env reset.
        Variables:
            brain_names:    a list of brain names set in Unity.
            state: store    a list of states for each brain. each item contain a list of states for each agents that controlled by the same brain.
            visual_state:   store a list of visual state information for each brain.
            action:         store a list of actions for each brain.
            dones_flag:     store a list of 'done' for each brain. use for judge whether an episode is finished for every agents.
            rewards:        use to record rewards of agents for each brain.
        """
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        if self.use_GCN:
            adj, x, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 6)
            sma = [
                SMA(moving_average_episode) for i in range(self.env.brain_num)
            ]

            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            adj=adj[i], x=x[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data_gcn(adj=adj[i],
                                                      x=x[i],
                                                      visual_s=visual_state[i],
                                                      a=action[i],
                                                      r=_r,
                                                      adj_=_adj,
                                                      x_=_x,
                                                      visual_s_=_vs,
                                                      done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        adj[i] = _adj
                        x[i] = _x
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)

                    if all([
                            all(dones_flag[i])
                            for i in range(self.env.brain_num)
                    ]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break

                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(
                        episode,
                        reward_mean=rewards[i].mean(),
                        reward_min=rewards[i].min(),
                        reward_max=rewards[i].max(),
                        step=last_done_step,
                        **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)

                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)

        else:
            state, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 5)
            sma = [
                SMA(moving_average_episode) for i in range(self.env.brain_num)
            ]

            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    state[i] = _v
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            s=state[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data(s=state[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  s_=_v,
                                                  visual_s_=_vs,
                                                  done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        state[i] = _v
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)

                    if all([
                            all(dones_flag[i])
                            for i in range(self.env.brain_num)
                    ]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break

                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(
                        episode,
                        reward_mean=rewards[i].mean(),
                        reward_min=rewards[i].min(),
                        reward_max=rewards[i].max(),
                        step=last_done_step,
                        **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)

                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)
Example #14
def unity_train(env, models, print_func, begin_train_step, begin_frame_step,
                begin_episode, save_frequency, max_step_per_episode,
                max_train_episode, policy_mode, moving_average_episode,
                add_noise2buffer, add_noise2buffer_episode_interval,
                add_noise2buffer_steps, max_train_step, max_frame_step,
                real_done, off_policy_train_interval):
    """
    TODO: Annotation
    Train loop. Execute until episode reaches its maximum or press 'ctrl+c' artificially.
    Inputs:
        env:                    Environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        reset_config:           configuration to reset for Unity environment.
        max_step_per_episode:               maximum number of steps for an episode.
        sampler_manager:        sampler configuration parameters for 'reset_config'.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of brain names set in Unity.
        state: store    a list of states for each brain. each item contain a list of states for each agents that controlled by the same brain.
        visual_state:   store a list of visual state information for each brain.
        action:         store a list of actions for each brain.
        dones_flag:     store a list of 'done' for each brain. use for judge whether an episode is finished for every agents.
        rewards:        use to record rewards of agents for each brain.
    """

    state, visual_state, action, dones_flag, rewards = zeros_initializer(
        env.brain_num, 5)
    sma = [SMA(moving_average_episode) for i in range(env.brain_num)]
    frame_step = begin_frame_step
    min_of_all_agents = min(env.brain_agents)
    train_step = [begin_train_step for _ in range(env.brain_num)]

    for episode in range(begin_episode, max_train_episode):
        [model.reset() for model in models]
        ObsRewDone = env.reset()
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            dones_flag[i] = np.zeros(env.brain_agents[i])
            rewards[i] = np.zeros(env.brain_agents[i])
            state[i] = _v
            visual_state[i] = _vs
        step = 0
        last_done_step = -1
        while True:
            step += 1
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i],
                                                    visual_s=visual_state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(env.brain_names)
            }
            ObsRewDone = env.step(actions)

            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                unfinished_index = np.where(dones_flag[i] == False)[0]
                dones_flag[i] += _d
                models[i].store_data(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=_r,
                    s_=_v,
                    visual_s_=_vs,
                    done=_info['real_done'] if real_done else _d)
                models[i].partial_reset(_d)
                rewards[i][unfinished_index] += _r[unfinished_index]
                state[i] = _v
                visual_state[i] = _vs
                if policy_mode == 'off-policy':
                    if train_step[i] % off_policy_train_interval == 0:
                        models[i].learn(episode=episode, train_step=train_step)
                    train_step[i] += 1
                    if train_step[i] % save_frequency == 0:
                        models[i].save_checkpoint(train_step=train_step[i],
                                                  episode=episode,
                                                  frame_step=frame_step)

            frame_step += min_of_all_agents
            if 0 < max_train_step < min(
                    train_step) or 0 < max_frame_step < frame_step:
                for i in range(env.brain_num):
                    models[i].save_checkpoint(train_step=train_step[i],
                                              episode=episode,
                                              frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all([all(dones_flag[i]) for i in range(env.brain_num)]):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(env.brain_num):
            sma[i].update(rewards[i])
            if policy_mode == 'on-policy':
                models[i].learn(episode=episode, train_step=train_step)
                train_step[i] += 1
                if train_step[i] % save_frequency == 0:
                    models[i].save_checkpoint(train_step=train_step[i],
                                              episode=episode,
                                              frame_step=frame_step)
            models[i].writer_summary(episode,
                                     reward_mean=rewards[i].mean(),
                                     reward_min=rewards[i].min(),
                                     reward_max=rewards[i].max(),
                                     step=last_done_step,
                                     **sma[i].rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
        )
        for i, bn in enumerate(env.brain_names):
            print_func(f'{bn} reward: {arrprint(rewards[i], 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            unity_no_op(env,
                        models,
                        print_func=print_func,
                        pre_fill_steps=add_noise2buffer_steps,
                        prefill_choose=False,
                        real_done=real_done,
                        desc='adding noise')
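
`SMA(moving_average_episode)` and its `rs` attribute are also external to this page. From the usage (`sma[i].update(rewards[i])` once per episode, then `**sma[i].rs` spread into `writer_summary`), a plausible sketch is a sliding window over per-episode mean rewards; the key name in `rs` below is an assumption, not the project's actual field:

from collections import deque

import numpy as np

class SMA:
    """Sketch of a simple moving average over recent episode rewards.
    The exact statistics RLs exposes through `rs` are not shown here;
    `sma_reward` is an assumed key name."""

    def __init__(self, window):
        self.buffer = deque(maxlen=window)

    def update(self, episode_rewards):
        self.buffer.append(float(np.mean(episode_rewards)))

    @property
    def rs(self):
        return {'sma_reward': float(np.mean(self.buffer))}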