Example #1
0
    def __init__(self, cfg):
        self.cfg = cfg

        self.inds = None

        self.beta = LinearSchedule(cfg['replay_beta_iters'],
                                   initial_p=cfg['replay_beta_base'],
                                   final_p=cfg['replay_beta_top'])

        self.counter = 0
        self.timetable = [0] * cfg['replay_size']
        self.delta = cfg['freeze_count'] + cfg['freeze_delta']

        self.mem = Memory(cfg['replay_size'], cfg['select_count'],
                          cfg['replay_alpha'])
Example #2
0
    def __init__(self, n_step, replay_reanalyze, buffer_size, select_count,
                 max_ep_draw_count, alpha, beta_base, beta_end, beta_horizon,
                 recalc_delta):
        self.batch_count = 0
        self.inds = None

        self.beta = LinearSchedule(beta_horizon, beta_base, beta_end)

        self.counter = 0
        self.ts = 0
        self.timetable = [0] * buffer_size

        self.n_step = n_step
        self.max_ep_draw_count = max_ep_draw_count
        self.delta = recalc_delta

        self.replay_reanalyze = replay_reanalyze
        self.buffer_size = buffer_size

        self.mem = Memory(buffer_size, select_count, alpha)
Example #3
0
    def __init__(self, bot_id, cfg, task_factory, encoder, Actor, Critic, goal_encoder):
        self.cfg = cfg
        self.bot = Bot(
                cfg,
                bot_id,
                encoder,
                goal_encoder,
                Actor,
                Critic,
                task_factory.state_size,
                task_factory.action_size,
                task_factory.wrap_action,
                task_factory.wrap_value)
        self.bot.share_memory() # !! must be done from main process !!

        self.iter = 0
        self.freezed = 0

        self.counter = 1
        self.tau = LinearSchedule(cfg['tau_replay_counter'], cfg['tau_base'], cfg['tau_final'])

        self.lock = Lock()
        self.bot = BotProxy(self.lock, cfg, self.bot, cfg['device'])
Example #4
0
    def __init__(self, cfg, bot, objective_id, task_factory, update_goal):
        assert not cfg['gae'] or cfg['n_step'] == 1, \
            "gae is currently enabled only with one step lookahead!"

        self.cfg = cfg
        self.objective_id = objective_id

        self.bot = bot
        self.update_goal = update_goal

        self.stop = False

        self.debug_out_ex = "y" * 10

        self.n_step = self.cfg['n_step']
        self.discount = self.cfg['discount_rate']
        self.n_discount = 1. if self.cfg['gae'] else self.discount**self.n_step
        self.batch_size = self.cfg['batch_size']

        self.counter = 0
        self.tau = LinearSchedule(cfg['tau_replay_counter'],
                                  initial_p=self.cfg['tau_base'],
                                  final_p=cfg['tau_final'])

        self.replay = task_factory.make_replay_buffer(cfg)

        self.full_episode = []
        self.last_train_cap = self.cfg['critic_learn_delta']

        # IMHO a configurable choice here: use curiosity, TD errors, random, or another method
        self.curiosity = CuriosityPrio(task_factory.state_size,
                                       task_factory.action_size,
                                       task_factory.action_range,
                                       task_factory.wrap_action, cfg['device'],
                                       cfg)
Example #5
0
class ReplayBuffer:
    def __init__(self, n_step, replay_reanalyze, buffer_size, select_count,
                 max_ep_draw_count, alpha, beta_base, beta_end, beta_horizon,
                 recalc_delta):
        self.batch_count = 0
        self.inds = None

        self.beta = LinearSchedule(beta_horizon, beta_base, beta_end)

        self.counter = 0
        self.ts = 0
        self.timetable = [0] * buffer_size

        self.n_step = n_step
        self.max_ep_draw_count = max_ep_draw_count
        self.delta = recalc_delta

        self.replay_reanalyze = replay_reanalyze
        self.buffer_size = buffer_size

        self.mem = Memory(buffer_size, select_count, alpha)

    def sample(self, batch_size, critic):
        self.inds, data = zip(*self._sample(batch_size, critic))
        # TODO: avoid the numpy vstack + transpose round-trip
        data = np.vstack(data)
        return data.T

    def add(self, batch, prios, hashkey):
        if len(prios) < self.n_step * 2:
            return
        if not self._worth_experience(prios):
            return

        self.ts = (self.ts + 1) % len(self.timetable)
        # do first update when we do first freeze
        self.timetable[self.ts] = self.counter - self.delta
        for i, data in enumerate(batch):
            self.mem.add(
                [np.asarray(data), i,
                 len(prios) - i - 1, hashkey, self.ts], prios[i])

    def _worth_experience(self, prios):
        #        return True
        if not len(self):
            return True
        if len(self) < self.buffer_size:
            return True
        for _ in range(10):
            data = self.mem.select(1.)
            if data is None:
                return True
            _, w, _ = data
            status = prios.mean() > np.mean(w)
            if status:
                return True
        return 0 == random.randint(0, 4)

    def _sample(self, batch_size, critic):
        """
        sampling should be multithreaded ~ mainly the recalc part
        """
        self.counter += 1
        self.batch_count = 0
        while self.batch_count < batch_size:
            data = self.mem.select(self.beta.value())
            if data is None:
                continue
            batch, _, inds = data
            if batch is None:
                continue
            _, local_forward, local_backward, hashkey, timestamp = zip(*batch)

            uniq = set(
                map(lambda i_b: i_b[0] - i_b[1], zip(inds, local_forward)))
            for i, b, f, k, t in zip(inds, local_backward, local_forward,
                                     hashkey, timestamp):
                pivot = i - f
                if pivot < 0 or pivot + b + f > len(self):
                    continue  # temporarily we want to avoid this corner case .. TODO
                if pivot not in uniq:
                    continue
                if 0 != self.mem.tree.data[pivot][1]:
                    continue
                assert pivot + self.mem.tree.data[pivot][
                    2] == i + b, "--> {} {} :: {} {}".format(
                        pivot, [(x[1], x[2])
                                for x in self.mem.tree.data[pivot:i]], i, b)
                uniq.remove(pivot)
                bc = self.batch_count
                data = zip(*self._do_sample_wrap(pivot, b + f, critic, k, t))
                if bc == self.batch_count:
                    continue
                yield data

    def _do_sample_wrap(self, pivot, length, critic, hashkey, timestamp):
        return self._do_sample(self.mem.tree.data[pivot:pivot + length], pivot,
                               length, critic, hashkey, timestamp)

# TODO :: REFACTOR! too complex logic, from replay-reanalyze to recalc updates and the timetable ..

    def _do_sample(self, full_episode, pivot, length, critic, hashkey,
                   timestamp):
        available_range = range(length - self.n_step)

        top = min(len(available_range), self.max_ep_draw_count)
        replay = random.sample(available_range, random.randint(1, top))

        recalc = abs(self.timetable[timestamp] - self.counter) >= self.delta

        if not critic or not self.replay_reanalyze:
            episode_approved = map(lambda i: (full_episode[i][0], i in replay),
                                   range(length))
        else:
            episode_approved = critic.reanalyze_experience(
                full_episode, replay, recalc, -1 == timestamp)

        if recalc:
            self.timetable[timestamp] = self.counter

        for i, step_good in enumerate(episode_approved):
            step, good = step_good
            if recalc and -1 != timestamp and self.replay_reanalyze:
                self.mem.tree.data[pivot + i][0][...] = np.asarray(step)

            if i not in replay:
                continue
            if not good:
                continue
            self.batch_count += 1
            yield pivot + i, step

    def update(self, prios):
        '''
        the replay buffer must be accessed single-threaded, or properly locked ...
          ( sample, update, add )
          well, in theory, as it is not expanding, we don't care much about concurrent reads .. for now
        '''
        self.mem.priority_update(np.hstack(self.inds), prios)
        self.inds = None

    def __len__(self):
        return len(self.mem)
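
The snippets above construct LinearSchedule(horizon, initial_p, final_p) and later call beta.value() / tau.value() with no arguments, which suggests a stateful schedule that advances on every call. Below is a minimal sketch under that assumption; it is illustrative only, the actual implementation these classes were written against is not shown here.

class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p
        self.step = 0

    def value(self):
        # linear interpolation from initial_p to final_p, clamped at the horizon
        fraction = min(float(self.step) / max(1, self.schedule_timesteps), 1.)
        self.step += 1
        return self.initial_p + fraction * (self.final_p - self.initial_p)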
Example #6
0
class Critic:
    def __init__(self, cfg, bot, objective_id, task_factory, update_goal):
        assert not cfg['gae'] or cfg['n_step'] == 1, \
            "gae is currently enabled only with one step lookahead!"

        self.cfg = cfg
        self.objective_id = objective_id

        self.bot = bot
        self.update_goal = update_goal

        self.stop = False

        self.debug_out_ex = "y" * 10

        self.n_step = self.cfg['n_step']
        self.discount = self.cfg['discount_rate']
        self.n_discount = 1. if self.cfg['gae'] else self.discount**self.n_step
        self.batch_size = self.cfg['batch_size']

        self.counter = 0
        self.tau = LinearSchedule(cfg['tau_replay_counter'],
                                  initial_p=self.cfg['tau_base'],
                                  final_p=cfg['tau_final'])

        self.replay = task_factory.make_replay_buffer(cfg)

        self.full_episode = []
        self.last_train_cap = self.cfg['critic_learn_delta']

        # IMHO a configurable choice here: use curiosity, TD errors, random, or another method
        self.curiosity = CuriosityPrio(task_factory.state_size,
                                       task_factory.action_size,
                                       task_factory.action_range,
                                       task_factory.wrap_action, cfg['device'],
                                       cfg)

    def training_loop(self, ping, sync, share_gate, loss_gate, stats):
        while True:
            exp = share_gate.get()
            if exp is None:
                break

            full, action, exp = exp
            if not full:
                self._inject(action, exp)
            else:
                self._train(loss_gate, ping, stats, action, exp)

            if not self.cfg['critic_learn_delta']:
                continue
            if len(self.full_episode) < self.last_train_cap:
                continue

            self.last_train_cap += self.cfg['critic_learn_delta']

            #            print("\n%s\nDO FAST TRAIN : %i\n%s\n"%('*' * 60, len(self.full_episode), '*' * 60))
            ping.put(True)  # old-style scoping ... is there a nicer Python way?
            for batch in self._do_sampling():
                self._eval(loss_gate, stats, batch)
            ping.get()

        self._dtor(ping, sync, stats)

    def _dtor(self, ping, sync, stats):
        self.stop = True
        while not ping.empty():
            time.sleep(.1)
        while not stats.empty():
            stats.get()
        sync.put(True)

    def _inject(self, action, exp):
        goals, states, features, actions, probs, rewards, n_goals, n_states, n_features, good = exp
        if not len(states):
            return

        n_rewards = policy.td_lambda(
            rewards, self.n_step,
            self.discount) if not self.cfg['gae'] else policy.gae(
                rewards,
                self.bot.qa_future(
                    self.objective_id,
                    np.vstack([goals, [goals[-1]]]).reshape(
                        len(goals) + 1, -1), np.vstack([states, n_states[-1]]),
                    np.vstack([features, [n_features[-1]]]),
                    np.vstack([actions, [action]])),
                self.discount,
                self.cfg['gae_tau'],
                stochastic=False)

        full_episode = np.vstack(
            zip(*[
                goals, states, features, actions, probs, rewards, n_goals,
                n_states, n_features, n_rewards, good
            ]))

        if not len(self.full_episode):
            self.full_episode = full_episode
        else:
            self.full_episode = np.vstack([self.full_episode, full_episode])

    def _train(self, loss_gate, ping, stats, action, exp):
        self._inject(action, exp)

        self._update_memory()

        self._self_play(loss_gate, ping, stats)
        # abandoned REINFORCE clip, as I think that is a no-go for AGI...

        #        print("\n%s\nFULL EPISODE LENGTH : %i\n%s\n"%('*' * 60, len(self.full_episode), '*' * 60))
        self.full_episode = []
        self.last_train_cap = self.cfg['critic_learn_delta']

    def _self_play(self, loss_gate, ping, stats):
        ping.put(True)
        for _ in range(self.cfg['full_replay_count']):
            samples = self._select()
            if samples is None:
                continue
            self._eval(loss_gate, stats, samples.T)
        ping.get()

    def _update_memory(self):
        goals, states, features, actions, probs, rewards, n_goals, n_states, n_features, n_rewards, good = self.full_episode.T
        goals, states, n_goals, n_states, actions = np.vstack(
            goals), np.vstack(states), np.vstack(n_goals), np.vstack(
                n_states), np.vstack(actions)

        prios = self.curiosity.weight(states, n_states, actions)

        self.replay.add(
            map(
                lambda i: (goals[i], states[i], features[i], actions[i], probs[
                    i], rewards[i], n_goals[i], n_states[i], n_features[i],
                           n_rewards[i]),
                filter(lambda i: bool(sum(good[i:i + self.cfg['good_reach']])),
                       range(len(states)))), prios, hash(states.tobytes()))

        self.curiosity.update(states, n_states, actions)

    def _eval(self, loss_gate, stats, args):
        if self.stop:
            return

        goals, states, features, actions, probs, n_goals, n_states, n_features, n_rewards = args

        assert len(n_features) == len(features), "features mismatch"
        if len(n_features) != len(features):
            return

        goals, states, features, actions = np.vstack(goals), np.vstack(
            states), np.vstack(features), np.vstack(actions)
        n_goals, n_states, n_features, n_rewards = np.vstack(
            n_goals), np.vstack(n_states), np.vstack(n_features), np.vstack(
                n_rewards)

        # func approximators; self play
        n_qa = self.bot.q_future(self.objective_id, n_goals, n_states,
                                 n_features)
        # n_step target + bellman equation
        td_targets = n_rewards + self.n_discount * n_qa
        # learn !!
        self.counter += 1
        self.bot.learn_critic(
            self.objective_id, goals, states, features, actions, td_targets,
            self.tau.value() *
            (0 == self.counter % self.cfg['critic_update_delay']))

        # propagate back to simulation ~ debug purposes
        if stats is not None and self.cfg['dbgout'] and not self.stop:
            stats.put("[ TARGET:{:2f} replay::{} ]<----".format(
                td_targets[-1].item(), len(self.replay)))

        # propagate back to main process
        loss_gate.put([goals, states, features, actions, probs, td_targets])

        # WARNING : EXPERIMENT ~~> here we provide, on purpose, the same features as for the n-state
        # basically we are leaking the future of that trajectory, so what will our agent do ?
        # Bellman would probably not be proud of me at this point :)
        #loss_gate.put([ goals, states, n_features, actions, probs, td_targets ])

    def _population(self, batch):
        return random.sample(
            range(len(batch)),
            random.randint(1, min(2 * self.cfg['batch_size'],
                                  len(batch) - 1)))

    def _do_sampling(self):
        if self.stop:
            return
        batch = self._fast_exp()
        if batch is None:
            return

        # The two string markers below act as a crude toggle between sampling
        # strategies: the first loop ("first-order experience focus") is active,
        # while the second loop is disabled by being wrapped in the
        # `replay_focused` string literal.
#        first_order_experience_focus = '''
        for _ in range(self.cfg['fast_exp_epochs']):
            samples = self._select()
            mini_batch = batch if samples is None else np.vstack(
                [batch, samples])
            population = self._population(mini_batch)
            yield mini_batch[population].T

        replay_focused = '''
        for _ in range(self.cfg['fast_exp_epochs']):
            population = self._population(batch)
            samples = self._select()
            if None != samples:
                yield np.vstack([batch[population], samples]).T
            else:
                yield batch[population].T
#        '''
        population = self._population(batch)
        yield batch[population].T  # push towards latest experience

    def _fast_exp(self):
        if max(len(self.replay), len(self.full_episode)) < self.batch_size:
            return None

        goals, states, features, actions, probs, _, n_goals, n_states, n_features, n_rewards, _ = self.full_episode.T
        return np.vstack(
            zip(goals, states, features, actions, probs, n_goals, n_states,
                n_features, n_rewards))

    def _select(self):
        if len(self.replay) < self.batch_size:
            return None

        data = self.replay.sample(self.batch_size, self)
        if data is None:
            return None

        goals, states, features, actions, probs, _, n_goals, n_states, n_features, n_rewards = data
        if not len(actions):
            return None

        self._update_replay_prios(states, n_states, actions, probs)

        return np.vstack(
            zip(goals, states, features, actions, probs, n_goals, n_states,
                n_features, n_rewards))

    def _update_replay_prios(self, states, n_states, actions, probs):
        if not self.cfg['replay_cleaning']:
            return
        states, n_states, actions = np.vstack(states), np.vstack(
            n_states), np.vstack(actions)
        prios = self.curiosity.weight(states, n_states, actions)
        # seems we are a bit too far off-policy for PG ( PPO ) to do anything good; the replay buffer should abandon those
        prios[
            self.cfg['prob_treshold'] < np.abs(np.vstack(probs).mean(-1))] = 0
        self.replay.update(prios)

# main bottleneck of the whole solution, but we are experimenting, so .. :)
# also I think it can be played with, given enough hardware/resources
#  -> scale it properly, and do things in the background in parallel ..
# + if the main concern were speed, I would not do it in Python in the first place ..

    def reanalyze_experience(self, episode, indices, recalc):
        # IMHO this iterates through the episode too many times ... better to implement it in one sweep ... TODO
        goals, states, f, a, p = zip(
            *[[e[0][0], e[0][1], e[0][2], e[0][3], e[0][4]] for e in episode])

        goals, states = np.asarray(goals), np.asarray(states)

        if recalc:
            f, p = self.bot.reevaluate(self.objective_id, goals, states, a)

        r, g, s, n_g, n_s = zip(*self.update_goal(*zip(*[
            (  # magic *
                e[0][5],  # rewards .. forwarded only so they can be passed back to us
                e[0][0],  # goals ..
                e[0][1],  # states ..
                e[0][6],  # n_goals ..
                e[0][7],  # n_states ..
                #                e[0][2], # action .. well for now no need, however some emulator may need them
                bool(random.randint(0, self.cfg['her_max_ratio'])
                     ),  # update or not
            ) for e in episode
        ])))

        n = [e[0][9] for e in episode
             ] if not recalc or not self.cfg['gae'] else policy.gae(
                 r,
                 self.bot.qa_future(self.objective_id, goals, states,
                                    np.asarray(f), np.asarray(a)),
                 self.discount, self.cfg['gae_tau'])

        for i in indices:
            yield (g[i], s[i], f[i], a[i], p[i], r[i], n_g[i], n_s[i],
                   f[(i + self.n_step) if i + self.n_step < len(f) else -1],
                   n[i])
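
In _eval the critic forms the bootstrapped target td_targets = n_rewards + self.n_discount * n_qa, where n_rewards is assumed to already hold the discounted n-step return (what policy.td_lambda is presumed to produce) and n_discount equals discount_rate ** n_step (or 1.0 when GAE is enabled). A tiny numpy sketch of that arithmetic with made-up numbers, for illustration only:

import numpy as np

discount, n_step = 0.99, 3
rewards = np.array([1.0, 0.0, 0.5])                              # r_t .. r_{t+n-1}
n_reward = sum(discount**i * r for i, r in enumerate(rewards))   # discounted n-step return
n_qa = 2.0                                                       # critic's value n steps ahead
n_discount = discount**n_step                                    # matches self.n_discount with gae off
td_target = n_reward + n_discount * n_qa                         # same arithmetic as in _eval
print(td_target)                                                 # 1.49005 + 0.970299 * 2.0 ~= 3.43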
Example #7
0
    def __init__(
        self,
        brain,
        replay_buffer,
        update_goal,  # HER / RNN / GAE essential for ReplayBuffer
        n_groups,
        n_step,
        floating_step,
        gamma,
        good_reach,
        sync_delta,
        learning_delay,
        learning_repeat,
        batch_size,  # immediate learning
        fresh_frac,
        optim_epochs,
        replay_cleaning,
        prob_treshold,  # PPO
        her_max_ratio,
        gae,
        gae_tau,
        tau_replay_counter,
        tau_base,
        tau_final,
        freeze_delta,
        freeze_count,
    ):
        # TODO : add to config ...
        self.fresh_frac = fresh_frac
        self.optim_epochs = optim_epochs
        self.floating_step = floating_step

        self.brain = brain
        self.update_goal = update_goal

        # encoders freezing!
        self.freeze_delta = freeze_delta
        self.freeze_count = freeze_count
        self.iter = 0
        self.freezed = 0

        self.n_groups = n_groups

        self.n_step = n_step
        self.gamma = gamma
        self.good_reach = good_reach
        self.learning_delay = learning_delay
        self.sync_delta = sync_delta
        self.learning_repeat = learning_repeat
        self.batch_size = batch_size
        self.replay_cleaning = replay_cleaning
        self.prob_treshold = prob_treshold
        self.her_max_ratio = her_max_ratio
        self.gae = gae
        self.gae_tau = gae_tau

        self.replay_buffer = replay_buffer

        self.tau = LinearSchedule(tau_replay_counter,
                                  initial_p=tau_base,
                                  final_p=tau_final)

        self.counter = 0
        self.reset()

        self.steps = 0
        self.last_train_cap = self.learning_delay
Example #8
0
class Agent:
    def __init__(
        self,
        brain,
        replay_buffer,
        update_goal,  # HER / RNN / GAE essential for ReplayBuffer
        n_groups,
        n_step,
        floating_step,
        gamma,
        good_reach,
        sync_delta,
        learning_delay,
        learning_repeat,
        batch_size,  # immediate learning
        fresh_frac,
        optim_epochs,
        replay_cleaning,
        prob_treshold,  # PPO
        her_max_ratio,
        gae,
        gae_tau,
        tau_replay_counter,
        tau_base,
        tau_final,
        freeze_delta,
        freeze_count,
    ):
        # TODO : add to config ...
        self.fresh_frac = fresh_frac
        self.optim_epochs = optim_epochs
        self.floating_step = floating_step

        self.brain = brain
        self.update_goal = update_goal

        # encoders freezing!
        self.freeze_delta = freeze_delta
        self.freeze_count = freeze_count
        self.iter = 0
        self.freezed = 0

        self.n_groups = n_groups

        self.n_step = n_step
        self.gamma = gamma
        self.good_reach = good_reach
        self.learning_delay = learning_delay
        self.sync_delta = sync_delta
        self.learning_repeat = learning_repeat
        self.batch_size = batch_size
        self.replay_cleaning = replay_cleaning
        self.prob_treshold = prob_treshold
        self.her_max_ratio = her_max_ratio
        self.gae = gae
        self.gae_tau = gae_tau

        self.replay_buffer = replay_buffer

        self.tau = LinearSchedule(tau_replay_counter,
                                  initial_p=tau_base,
                                  final_p=tau_final)

        self.counter = 0
        self.reset()

        self.steps = 0
        self.last_train_cap = self.learning_delay

    def reset(self):
        (self.goals, self.states, self.features, self.actions, self.probs,
         self.rewards, self.n_goals, self.n_states, self.n_features,
         self.credits, self.discounts, self.goods,
         self.n_steps) = ([], [], [], [], [], [], [], [], [], [], [], [], [])

    def training_loop(self, share_gate):
        while True:
            exp = share_gate.get()
            if exp is None:
                break
            self.train(exp)

    def train(self, exp):
        finished, exp = exp
        if not finished:
            self._inject(False, exp)
        else:
            self._finish(exp)

    def _try_learn(self):
        if self.steps < self.last_train_cap:
            return

        self.last_train_cap += self.learning_delay

        for batch in self._do_sampling():
            self.counter += 1
            for _ in range(self.optim_epochs):
                self.brain.learn(
                    batch,
                    self.tau.value() if
                    (0 == self.counter % self.sync_delta) else 0)

    def _finish(self, exp):
        self._inject(True, exp)
        self._update_memory()
        self.reset()

    def _regroup_for_memory(self):
        g = lambda data: data.reshape(len(data), self.n_groups, -1)
        return (g(self.goals), g(self.states), g(self.features),
                g(self.actions), g(self.probs), g(self.rewards),
                g(self.n_goals), g(self.n_states), g(self.n_features),
                g(self.credits), self.discounts.reshape(-1, 1), g(self.goods))

    def _update_memory(self):
        batch = self._regroup_for_replay()
        prios = self.brain.novelity(batch)

        assert len(self.n_steps) == len(self.discounts) - self.n_step
        g, s, f, a, p, r, n_g, n_s, n_f, n_r, n_d, good = self._regroup_for_memory(
        )
        prios = np.reshape(prios,
                           [len(g), self.n_groups, -1]).mean(2)  # per group
        for i in range(g.shape[1]):
            self.replay_buffer.add(
                map(
                    lambda j: (g[j, i], s[j, i], f[j, i], a[j, i], p[j, i], r[
                        j, i], n_g[j, i], n_s[j, i], n_f[j, i], n_r[j, i], n_d[
                            j], self.n_steps[j]
                               if j < len(n_d) - self.n_step else None,
                               bool((j < len(s) - self.n_step) and good[
                                   j:j + self.good_reach, i].sum())),
                    range(len(s))), prios[:, i], hash(s[:, i].tobytes()))

    def _population(self, batch, limit):
        return random.sample(range(len(batch)),
                             random.randint(1, min(limit,
                                                   len(batch) - 1)))

    def _regroup_for_replay(self):
        g = lambda data: np.reshape(data, [len(data) * self.n_groups, -1])
        return (g(self.goals), g(self.states), g(self.features),
                g(self.actions), g(self.probs), g(self.n_goals),
                g(self.n_states), g(self.n_features), g(self.credits),
                self.discounts.reshape(-1, 1))

    def _do_sampling(self):
        if max(len(self.replay_buffer), len(self.states)) < self.batch_size:
            return None

        if not len(self.states):
            return None

        batch = np.vstack(zip(*self._regroup_for_replay()))

        for i in range(self.learning_repeat):
            self._encoder_freeze_schedule()

            samples = self._select()
            # keep an eye on latest experience
            population = [] if (not self.fresh_frac
                                or i % self.fresh_frac) else self._population(
                                    batch, self.batch_size // self.fresh_frac)
            mini_batch = batch[population] if samples is None else np.vstack(
                [samples, batch[population]])
            yield mini_batch.T

    def _encoder_freeze_schedule(self):
        if not self.freeze_delta:
            return
        self.iter += (0 == self.freezed)
        if self.iter % self.freeze_delta:
            return
        if not self.freezed:
            self.brain.freeze_encoders()
        self.freezed += 1
        if self.freezed <= self.freeze_count:
            return
        self.freezed = 0
        self.brain.unfreeze_encoders()

    def _select(self):
        if len(self.replay_buffer) < self.batch_size:
            return None

        batch = self.replay_buffer.sample(self.batch_size, self)
        if batch is None:
            return None

        goals, states, features, actions, probs, _, n_goals, n_states, n_features, credits, discounts, _, _ = batch
        batch = (goals, states, features, actions, probs, n_goals, n_states,
                 n_features, credits, discounts)

        self._update_replay_prios(batch)

        return np.vstack(zip(*batch))

    def _update_replay_prios(self, batch):
        # maybe better to do sum(1), but then we need to propagate this when adding to the replay buffer as well ( aka .sum(2).mean(2) )
        prios = self.brain.novelity(batch).mean(
            1)  # mean through rewards for objectives
        # seems we are a bit too far off-policy for PG ( PPO ) to do anything good; the replay buffer should abandon those
        if self.replay_cleaning:
            (states, actions,
             goals) = (np.vstack(batch[1]), np.vstack(batch[3]),
                       np.vstack(batch[0]))

            new_probs = self.brain.reevaluate(goals, states, actions)
            prios[self.prob_treshold[1] < np.abs(new_probs.mean(-1))] = 1e-10
            prios[self.prob_treshold[0] > np.abs(new_probs.mean(-1))] = 1e-10

        self.replay_buffer.update(prios)

    def _assign_credit(self, n_groups, n_steps, rewards, goals, states,
                       features, actions, stochastic):
        regroup = lambda data: np.reshape(data, [len(data), n_groups, -1])

        grouped_rewards = regroup(rewards)
        if not self.gae:
            return policy.k_discount(n_steps, self.gamma), np.concatenate([
                policy.k_step(n_steps, grouped_rewards[:, i], self.gamma)
                for i in range(n_groups)
            ], 1)

        grouped_goals = regroup(goals)
        grouped_states = regroup(states)
        grouped_features = regroup(features)
        grouped_actions = regroup(actions)

        return policy.gae_discount(
            n_steps, self.gamma, self.gae_tau), np.concatenate([
                policy.gae(
                    n_steps, grouped_rewards[:len(grouped_states) - 1, i],
                    self.brain.qa_future(
                        grouped_goals[:, i], grouped_states[:, i],
                        grouped_features[:, i], grouped_actions[:, i]),
                    self.gamma, self.gae_tau) for i in range(n_groups)
            ], 1)

    def _random_n_step_her(self, length, inds):
        do_n_step = lambda n: self.n_step if not self.floating_step else random.randint(
            n, self.n_step)
        n_step = lambda i: 1 if inds[i] else do_n_step(
            1 if length - 1 > i + self.n_step else (length - i - 1))
        return self._do_random_n_step(length, n_step)

    def _random_n_step(self, length):
        do_n_step = lambda n: self.n_step if not self.floating_step else random.randint(
            n, self.n_step)
        n_step = lambda i: do_n_step(1 if length - 1 > i + self.n_step else
                                     (length - i - 1))
        return self._do_random_n_step(length, n_step)

    def _do_random_n_step(self, length, n_step):
        n_steps = [n_step(i) for i in range(length - self.n_step)]
        indices = np.asarray(n_steps) + np.arange(len(n_steps))
        indices = np.hstack([indices, self.n_step * [-1]])
        return n_steps, indices

    def _redistribute_rewards(self, n_groups, n_steps, indices, rewards, goals,
                              states, features, actions, stochastic):
        # n-step, n-discount, n-return - Q(last state)
        discounts, credits = self._assign_credit(n_groups, n_steps, rewards,
                                                 goals, states, features,
                                                 actions, stochastic)

        discounts = np.hstack([discounts, self.n_step * [0]])
        credits = np.vstack(
            [credits, np.zeros([self.n_step, len(credits[0])])])

        return (  # for self-play
            credits,
            discounts,
            goals[indices],
            states[indices],
            features[indices],
        )

    def _inject(self, finished, exp):
        goals, states, features, actions, probs, rewards, goods = exp
        if not len(states):
            return  # can happen at the end of episode, we just handle it as a notification

        self.steps += len(states) - (self.n_step if not finished else 1)

        n_steps, n_indices = self._random_n_step(len(rewards))
        credits, discounts, n_goals, n_states, n_features = self._redistribute_rewards(
            self.n_groups,
            n_steps,
            n_indices,
            rewards,
            goals,
            states,
            features,
            actions,
            stochastic=True)

        if not finished:  # scatter overlapping info
            goods = goods[:-self.n_step]
            goals = goals[:-self.n_step]
            states = states[:-self.n_step]
            features = features[:-self.n_step]
            actions = actions[:-self.n_step]
            probs = probs[:-self.n_step]
            rewards = rewards[:-self.n_step]
            n_goals = n_goals[:-self.n_step]
            n_states = n_states[:-self.n_step]
            n_features = n_features[:-self.n_step]
            credits = credits[:-self.n_step]
            discounts = discounts[:-self.n_step]
            # this branch has not been properly evaluated yet, IMHO ...

        self.goods = np.vstack([self.goods, goods]) if len(
            self.goods) else goods
        self.goals = np.vstack([self.goals, goals]) if len(
            self.goals) else goals
        self.states = np.vstack([self.states, states]) if len(
            self.states) else states
        self.features = np.vstack([self.features, features]) if len(
            self.features) else features
        self.actions = np.vstack([self.actions, actions]) if len(
            self.actions) else actions
        self.probs = np.vstack([self.probs, probs]) if len(
            self.probs) else probs
        self.rewards = np.vstack([self.rewards, rewards]) if len(
            self.rewards) else rewards
        self.n_goals = np.vstack([self.n_goals, n_goals]) if len(
            self.n_goals) else n_goals
        self.n_states = np.vstack([self.n_states, n_states]) if len(
            self.n_states) else n_states
        self.n_features = np.vstack([self.n_features, n_features]) if len(
            self.n_features) else n_features
        self.credits = np.vstack([self.credits, credits]) if len(
            self.credits) else credits
        self.discounts = np.hstack([self.discounts, discounts]) if len(
            self.discounts) else discounts
        self.n_steps = np.hstack([self.n_steps, n_steps]) if len(
            self.n_steps) else n_steps

        self._try_learn()

    def _her(self, inds):
        collision_free = lambda i, ind: ind - self.n_step > inds[
            i - 1] and ind + 1 != inds[i + 1]
        hers = [-1] + [
            i
            for i, ind in enumerate(inds[1:-1]) if collision_free(i + 1, ind)
        ]

        pivot = 1
        indices = [inds[0]]
        hers.append(len(inds) - 1)
        for i, ind in enumerate(inds[1:]):
            if i == hers[pivot] or indices[-1] + 1 == ind or (
                    0 != random.randint(
                        0, 1 + (i - hers[pivot - 1]) // self.her_max_ratio)
                    and indices[-1] == inds[i]):
                indices.append(ind)
            if i == hers[pivot]:
                pivot += 1
        return indices

        # NOTE: everything below is unreachable; kept only as a debugging sanity check
        non_her = list(filter(lambda i: i not in indices, inds))
        for i in non_her:
            for j in indices:
                assert i + self.n_step < j or i - 1 > j, "OPLA {} || {} >< {}\n>{} ({}::{})".format(
                    indices, inds, non_her, hers, i, j)

        return indices

# TODO : refactor replay-buffer entry access, from indexing and stacking to a proper class with querying
# TODO-2 : factor out those nasty recalc ifs ... => make two separate functions!!

    def reanalyze_experience(self, episode, indices, recalc, cross_experience):
        f, a, p, c, d, n_steps = zip(
            *[[e[0][2], e[0][3], e[0][4], e[0][9], e[0][10], e[0][11]]
              for e in episode])

        inds = sorted(indices)
        cache = np.zeros(len(episode))
        if not recalc: cache[self._her(inds)] = 1

        n_steps, n_indices = self._random_n_step_her(len(episode), cache)

        # even if we don't recalc here, HER or another REWARD 'shaper' will do its job!!
        r, g, s, n_g, n_s, active = zip(
            *self.
            update_goal(  # this will change goals not states, OK for Q-function in GAE
                *zip(*[
                    (  # magic *
                        e[0][5],  # rewards .. forwarded only so they can be passed back to us
                        e[0][0],  # goals ..
                        e[0][1],  # states ..
                        episode[n_indices[i]][0][0],  # n_goals ..
                        episode[n_indices[i]][0][1],  # n_states ..
                        #                e[0][2], # action .. well for now no need, however some emulator may need them
                        bool(cache[i]),
                        n_steps[i] if len(episode) - self.n_step > i else None,
                    ) for i, e in enumerate(episode)
                ])))

        g, s, a, p = np.asarray(g), np.asarray(s), np.asarray(a), np.asarray(p)
        if recalc:
            f = self.brain.recalc_feats(g, s, a)
        else:
            f = np.asarray(f)
        # TODO : apparently the redistribute-rewards function should be decomposed
        c, d, _, _, _ = self._redistribute_rewards(  # update_goal worked on its own form of n_goal here, so don't touch it!
            1,
            n_steps,
            n_indices,
            r,
            g,
            s,
            f,
            a,
            stochastic=True)

        for i in range(len(episode)):  #indices:
            yield (
                [
                    g[i],
                    s[i],
                    f[i],
                    a[i],
                    p[i],
                    r[i],
                    n_g[i],
                    n_s[i],
                    f[n_indices[i]] if len(episode) - self.n_step > i else
                    None,  #episode[i][0][8],
                    c[i],
                    d[i],
                    n_steps[i] if len(episode) - self.n_step > i else None,
                    episode[i][0][-1]
                ],
                active[i] and episode[i][0][-1])
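
For orientation, _do_random_n_step pairs every index that still has a full lookahead with the index of its n-step successor and pads the tail with -1. The standalone lines below mirror that computation for the non-floating case (floating_step disabled, n_step = 3, episode length 6); the values shown are produced by the code itself, not taken from the original project.

import numpy as np

n_step, length = 3, 6
n_steps = [n_step for _ in range(length - n_step)]          # fixed lookahead when floating_step is off
indices = np.asarray(n_steps) + np.arange(len(n_steps))     # index of each step's n-step successor
indices = np.hstack([indices, n_step * [-1]])               # trailing steps fall back to -1 (last element)
print(n_steps, indices)                                     # [3, 3, 3] [ 3  4  5 -1 -1 -1]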
Example #9
0
class Zer0Bot:
    def __init__(self, bot_id, cfg, task_factory, encoder, Actor, Critic, goal_encoder):
        self.cfg = cfg
        self.bot = Bot(
                cfg,
                bot_id,
                encoder,
                goal_encoder,
                Actor,
                Critic,
                task_factory.state_size,
                task_factory.action_size,
                task_factory.wrap_action,
                task_factory.wrap_value)
        self.bot.share_memory() # !! must be done from main process !!

        self.iter = 0
        self.freezed = 0

        self.counter = 1
        self.tau = LinearSchedule(cfg['tau_replay_counter'], cfg['tau_base'], cfg['tau_final'])

        self.lock = Lock()
        self.bot = BotProxy(self.lock, cfg, self.bot, cfg['device'])

    def act(self, goal, state, history): # TODO rename to exploit
        return self.bot.exploit(goal, state, history)

    def train(self, loss, mcts, signal):
        self._encoder_freeze_schedule()
        #  seed = [random.randint(0, self.cfg['mcts_random_cap'])] * self.cfg['mcts_rounds']
        for c in mcts: # maybe we want to * n_episodes ...
            c.put([random.randint(0, self.cfg['mcts_random_cap'])] * self.cfg['mcts_rounds'])

        while all(c.empty() for c in signal):
            self._train_worker(loss)

        scores = []
        for s in signal:
            scores += s.get()
        return scores

    def _train_worker(self, loss):
        time.sleep(.1)
        status = self._update_policy(self.tau.value(), loss)
        if not status:
            return
        self.counter += 1

    def _update_policy(self, tau, loss_gate):
        if any(c.empty() for c in loss_gate):
            return False

        states, grads, actions = zip(*map(
            lambda i: self._get_grads(i, loss_gate[i].get()), range(self.cfg['n_simulations'])))

        if self.cfg["attention_enabled"]:
            # OK, we will scatter additional info that is hard to weight without further context
            gran = min(map(len, grads))
            states = np.vstack([s[:gran] for s in states])
            actions = np.vstack([a[:gran] for a in actions])
            grads = torch.cat([g[:gran] for g in grads])
        else:
            states = np.vstack(states)
            actions = np.vstack(actions)
            grads = torch.cat(grads)

        # in case of PPO it is safe to move full force
        tau = 1. if not self.cfg['ddpg'] else tau * (0 == self.counter % self.cfg['actor_update_delay'])
        self.bot.learn_actor(states, grads, actions, tau)
        return True

    def _get_grads(self, i, s_f_a_p_td):
        # this is OK to call, as we are only using content known at creation time ( immutable )
        s, w, a = self._qa_function(i, *s_f_a_p_td)
        return s, w, a

    def _qa_function(self, objective_id, goals, states, history, actions, probs, td_targets):
        qa, dist = self.bot.q_explore(objective_id, goals, states, history)

        loss = self._qa_error(qa, td_targets)
        if self.cfg['normalize_advantages']:
            loss = policy.normalize(loss)

        probs = np.vstack(probs)
        grads = policy.policy_loss(
                torch.tensor(probs),
                dist.log_prob(torch.tensor(actions)),
                loss,
                self.cfg['ppo_eps'], self.cfg['dbgout_ratio'])

        return states, grads, actions

    def _qa_error(self, qa, td_targets):
        if not self.cfg['advantages_enabled']:
            return qa

        td_error = torch.tensor(td_targets).to(qa.device) - qa
        # in the case of DDPG we calculate the advantage a bit differently ~ check _eval and what is fed here;
        # the table is basically turned in favour of performance: we calculate grads w.r.t. the other action
        if self.cfg['ddpg']:
            td_error = -td_error

        if not self.cfg['advantages_boost']:
            return td_error

        for i, e in enumerate(td_error):
            td_error[i] = e if abs(e) > 1e-5 else qa[i]
        return td_error

    def _encoder_freeze_schedule(self):
        if not self.cfg['freeze_delta']:
            return
        self.iter += (0 == self.freezed)
        if self.iter % self.cfg['freeze_delta']:
            return
        if not self.freezed:
            self.bot.freeze_encoders()
        self.freezed += 1
        if self.freezed <= self.cfg['freeze_count']:
            return
        self.freezed = 0
        self.bot.unfreeze_encoders()
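
_encoder_freeze_schedule (shared by Zer0Bot above and Agent in Example #8) counts unfrozen iterations and, every freeze_delta of them, freezes the encoders for freeze_count consecutive calls before unfreezing again. The helper below replays only that counter logic, with comments standing in for the bot calls; it is an illustrative sketch, not part of the original code.

def trace_freeze_schedule(calls, freeze_delta, freeze_count):
    """'.' = encoders untouched, 'F' = frozen call, 'U' = call on which unfreeze fires."""
    iter_, freezed, out = 0, 0, []
    for _ in range(calls):
        iter_ += (0 == freezed)
        if iter_ % freeze_delta:
            out.append('.')
            continue
        if not freezed:
            # the real agent calls bot.freeze_encoders() at this point
            pass
        freezed += 1
        if freezed <= freeze_count:
            out.append('F')
            continue
        freezed = 0
        out.append('U')  # the real agent calls bot.unfreeze_encoders() here
    return ''.join(out)

print(trace_freeze_schedule(12, freeze_delta=3, freeze_count=2))  # ..FFU..FFU..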
Example #10
0
class ReplayBuffer:
    def __init__(self, cfg):
        self.cfg = cfg

        self.inds = None

        self.beta = LinearSchedule(cfg['replay_beta_iters'],
                                   initial_p=cfg['replay_beta_base'],
                                   final_p=cfg['replay_beta_top'])

        self.counter = 0
        self.timetable = [0] * cfg['replay_size']
        self.delta = cfg['freeze_count'] + cfg['freeze_delta']

        self.mem = Memory(cfg['replay_size'], cfg['select_count'],
                          cfg['replay_alpha'])

    def sample(self, batch_size, critic):
        self.inds, data = zip(*self._sample(batch_size, critic))
        # TODO: avoid the numpy vstack + transpose round-trip
        data = np.vstack(data)
        return data.T

    def add(self, batch, prios, hashkey):
        if len(prios) < self.cfg['n_step'] * 2:
            return
        if not self._worth_experience(prios):
            return

        self.counter = (self.counter + 1) % len(self.timetable)
        # do first update when we do first freeze
        self.timetable[self.counter] = (
            self.counter - self.delta - self.cfg['freeze_count'])
        for i, data in enumerate(batch):
            self.mem.add(
                [data, i, len(prios) - i - 1, hashkey, self.counter], prios[i])

    def _worth_experience(self, prios):
        if not len(self):
            return True
        if len(self) < self.cfg['replay_size']:
            return True
        for _ in range(10):
            data = self.mem.select(1.)
            if data is None:
                return True
            _, w, _ = data
            status = prios.mean() > np.mean(w)
            if status:
                return True
        return 0 == random.randint(0, 4)

    def _sample(self, batch_size, critic):
        count = 0
        while not count:
            data = self.mem.select(self.beta.value())
            if data is None:
                continue
            batch, _, inds = data
            if batch is None:
                continue
            data, local_forward, local_backward, hashkey, timestamp = zip(
                *batch)

            uniq = set(
                map(lambda i_b: i_b[0] - i_b[1], zip(inds, local_forward)))
            for i, b, f, k, t in zip(inds, local_backward, local_forward,
                                     hashkey, timestamp):
                #                if count >= self.cfg['max_ep_draw_count']:
                #                    break
                pivot = i - f
                if pivot < 0 or pivot + b + f > len(self):
                    continue  # temporarily we want to avoid this corner case .. TODO
                if pivot not in uniq:
                    continue
                uniq.remove(pivot)
                #                yield (i, self.mem.tree.data[i][0])
                #                continue
                count += 1
                yield zip(*self._do_sample_wrap(pivot, b + f, critic, k, t))

    def _do_sample_wrap(self, pivot, length, critic, hashkey, timestamp):
        return self._do_sample(self.mem.tree.data[pivot:pivot + length], pivot,
                               length, critic, hashkey, timestamp)

    def _do_sample(self, full_episode, pivot, length, critic, _, timestamp):
        available_range = range(length)

        top = min(len(available_range), self.cfg['max_ep_draw_count'])
        replay = random.sample(available_range, random.randint(1, top))

        recalc = abs(self.timetable[timestamp] - self.counter) > self.delta * 2

        if not critic or not self.cfg['replay_reanalyze']:
            episode = map(lambda i: full_episode[i][0], replay)
        else:
            episode = critic.reanalyze_experience(full_episode, replay, recalc)

        if recalc:
            self.timetable[timestamp] = self.counter

        for i, step in zip(replay, episode):
            yield pivot + i, step

    def update(self, prios):
        '''
        the replay buffer must be accessed single-threaded, or properly locked ...
          ( sample, update, add )
          well, in theory, as it is not expanding, we don't care much about concurrent reads .. for now
        '''
        self.mem.priority_update(np.hstack(self.inds), prios)
        self.inds = None

    def __len__(self):
        return len(self.mem)
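
For reference, the cfg-driven ReplayBuffer above reads the keys listed below. The values are purely illustrative placeholders (the original project's defaults are not shown here), and Memory / LinearSchedule are assumed to be importable from the project.

cfg = {
    'replay_size': 100000,        # capacity of Memory and of the timetable
    'select_count': 32,           # forwarded to Memory
    'replay_alpha': 0.6,          # prioritisation exponent for Memory
    'replay_beta_iters': 100000,  # horizon of the beta LinearSchedule
    'replay_beta_base': 0.4,      # initial beta (importance-sampling correction)
    'replay_beta_top': 1.0,       # final beta
    'n_step': 3,                  # episodes shorter than 2 * n_step are rejected by add()
    'max_ep_draw_count': 10,      # cap on steps drawn from a single episode
    'replay_reanalyze': True,     # let the critic re-evaluate replayed steps
    'freeze_delta': 10,           # freeze_delta + freeze_count defines the recalc delta
    'freeze_count': 3,
}

buffer = ReplayBuffer(cfg)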