import random
import time

import numpy as np
import torch

# Imports inferred from usage across this excerpt. LinearSchedule, Memory,
# policy, CuriosityPrio, Bot, BotProxy and Lock are project-local dependencies
# whose import paths are not shown here.


class ReplayBuffer:
    def __init__(self, n_step, replay_reanalyze, buffer_size, select_count,
                 max_ep_draw_count, alpha, beta_base, beta_end, beta_horizon,
                 recalc_delta):
        self.batch_count = 0
        self.inds = None
        self.beta = LinearSchedule(beta_horizon, beta_base, beta_end)

        self.counter = 0
        self.ts = 0
        self.timetable = [0] * buffer_size

        self.n_step = n_step
        self.max_ep_draw_count = max_ep_draw_count
        self.delta = recalc_delta
        self.replay_reanalyze = replay_reanalyze
        self.buffer_size = buffer_size

        self.mem = Memory(buffer_size, select_count, alpha)

    def sample(self, batch_size, critic):
        self.inds, data = zip(*self._sample(batch_size, critic))
        # TODO : avoid the numpy vstack + transpose round trip
        data = np.vstack(data)
        return data.T

    def add(self, batch, prios, hashkey):
        if len(prios) < self.n_step * 2:
            return
        if not self._worth_experience(prios):
            return

        self.ts = (self.ts + 1) % len(self.timetable)
        # do first update when we do first freeze
        self.timetable[self.ts] = self.counter - self.delta

        for i, data in enumerate(batch):
            self.mem.add(
                [np.asarray(data), i, len(prios) - i - 1, hashkey, self.ts],
                prios[i])

    def _worth_experience(self, prios):
        if not len(self):
            return True
        if len(self) < self.buffer_size:
            return True
        for _ in range(10):
            data = self.mem.select(1.)
            if None == data:
                return True
            _, w, _ = data
            status = prios.mean() > np.mean(w)
            if status:
                return True
        return 0 == random.randint(0, 4)

    def _sample(self, batch_size, critic):
        """ sampling should be multithreaded ~ mainly the recalc part """
        self.counter += 1
        self.batch_count = 0
        while self.batch_count < batch_size:
            data = self.mem.select(self.beta.value())
            if None == data:
                continue
            batch, _, inds = data
            if None == batch:
                continue

            _, local_forward, local_backward, hashkey, timestamp = zip(*batch)
            uniq = set(map(lambda i_b: i_b[0] - i_b[1], zip(inds, local_forward)))

            for i, b, f, k, t in zip(inds, local_backward, local_forward,
                                     hashkey, timestamp):
                pivot = i - f
                if pivot < 0 or pivot + b + f > len(self):
                    continue  # temporarily we want to avoid this corner case .. TODO
                if pivot not in uniq:
                    continue
                if 0 != self.mem.tree.data[pivot][1]:
                    continue

                assert pivot + self.mem.tree.data[pivot][2] == i + b, \
                    "--> {} {} :: {} {}".format(
                        pivot,
                        [(x[1], x[2]) for x in self.mem.tree.data[pivot:i]],
                        i, b)

                uniq.remove(pivot)

                bc = self.batch_count
                data = zip(*self._do_sample_wrap(pivot, b + f, critic, k, t))
                if bc == self.batch_count:
                    continue
                yield data

    def _do_sample_wrap(self, pivot, length, critic, hashkey, timestamp):
        return self._do_sample(self.mem.tree.data[pivot:pivot + length], pivot,
                               length, critic, hashkey, timestamp)

    # TODO :: REFACTOR! too complex logic, from replay-reanalyze to recalc updates and the timetable ..
    def _do_sample(self, full_episode, pivot, length, critic, hashkey, timestamp):
        available_range = range(length - self.n_step)
        top = min(len(available_range), self.max_ep_draw_count)
        replay = random.sample(available_range, random.randint(1, top))

        recalc = abs(self.timetable[timestamp] - self.counter) >= self.delta

        if not critic or not self.replay_reanalyze:
            episode_approved = map(
                lambda i: (full_episode[i][0], i in replay), range(length))
        else:
            episode_approved = critic.reanalyze_experience(
                full_episode, replay, recalc, -1 == timestamp)

        if recalc:
            self.timetable[timestamp] = self.counter

        for i, step_good in enumerate(episode_approved):
            step, good = step_good
            if recalc and -1 != timestamp and self.replay_reanalyze:
                self.mem.tree.data[pivot + i][0][...] = np.asarray(step)
            if i not in replay:
                continue
            if not good:
                continue
            self.batch_count += 1
            yield pivot + i, step

    def update(self, prios):
        '''
        replay buffer must be single-thread style access, or properly locked
        ( sample, update, add ) .. well, in theory; as it is not expanding we
        don't care much about concurrent reads .. for now
        '''
        self.mem.priority_update(np.hstack(self.inds), prios)
        self.inds = None

    def __len__(self):
        return len(self.mem)
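
# Illustration only: the per-step record layout that add() above pushes into the
# sum-tree is [step_data, steps_from_episode_start, steps_to_episode_end,
# episode_hashkey, timetable_slot], each weighted by its priority. The helper
# below merely mirrors that loop for a whole episode; it is not used by the class.
def episode_records_sketch(batch, prios, hashkey, timetable_slot):
    return [([np.asarray(data), i, len(prios) - i - 1, hashkey, timetable_slot],
             prios[i])
            for i, data in enumerate(batch)]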
class Critic:
    def __init__(self, cfg, bot, objective_id, task_factory, update_goal):
        assert not cfg['gae'] or cfg['n_step'] == 1, \
            "gae is currently enabled only with one step lookahead!"

        self.cfg = cfg
        self.objective_id = objective_id
        self.bot = bot
        self.update_goal = update_goal

        self.stop = False
        self.debug_out_ex = "y" * 10

        self.n_step = self.cfg['n_step']
        self.discount = self.cfg['discount_rate']
        self.n_discount = 1. if self.cfg['gae'] else (self.discount ** self.n_step)

        self.batch_size = self.cfg['batch_size']
        self.counter = 0
        self.tau = LinearSchedule(cfg['tau_replay_counter'],
                                  initial_p=self.cfg['tau_base'],
                                  final_p=cfg['tau_final'])

        self.replay = task_factory.make_replay_buffer(cfg)
        self.full_episode = []
        self.last_train_cap = self.cfg['critic_learn_delta']

        # configurable choice here : use curiosity, td errors, random, or another method
        self.curiosity = CuriosityPrio(task_factory.state_size,
                                       task_factory.action_size,
                                       task_factory.action_range,
                                       task_factory.wrap_action,
                                       cfg['device'], cfg)

    def training_loop(self, ping, sync, share_gate, loss_gate, stats):
        while True:
            exp = share_gate.get()
            if None == exp:
                break
            full, action, exp = exp
            if not full:
                self._inject(action, exp)
            else:
                self._train(loss_gate, ping, stats, action, exp)

            if not self.cfg['critic_learn_delta']:
                continue
            if len(self.full_episode) < self.last_train_cap:
                continue
            self.last_train_cap += self.cfg['critic_learn_delta']

            # print("\n%s\nDO FAST TRAIN : %i\n%s\n"%('*' * 60, len(self.full_episode), '*' * 60))
            ping.put(True)  # old style scoping ... is there a nicer pythonic way ?
            for batch in self._do_sampling():
                self._eval(loss_gate, stats, batch)
            ping.get()

        self._dtor(ping, sync, stats)

    def _dtor(self, ping, sync, stats):
        self.stop = True
        while not ping.empty():
            time.sleep(.1)
        while not stats.empty():
            stats.get()
        sync.put(True)

    def _inject(self, action, exp):
        goals, states, features, actions, probs, rewards, n_goals, n_states, n_features, good = exp
        if not len(states):
            return

        n_rewards = policy.td_lambda(
            rewards, self.n_step, self.discount
        ) if not self.cfg['gae'] else policy.gae(
            rewards,
            self.bot.qa_future(
                self.objective_id,
                np.vstack([goals, [goals[-1]]]).reshape(len(goals) + 1, -1),
                np.vstack([states, n_states[-1]]),
                np.vstack([features, [n_features[-1]]]),
                np.vstack([actions, [action]])),
            self.discount, self.cfg['gae_tau'], stochastic=False)

        full_episode = np.vstack(
            zip(*[
                goals, states, features, actions, probs, rewards, n_goals,
                n_states, n_features, n_rewards, good
            ]))

        if not len(self.full_episode):
            self.full_episode = full_episode
        else:
            self.full_episode = np.vstack([self.full_episode, full_episode])

    def _train(self, loss_gate, ping, stats, action, exp):
        self._inject(action, exp)
        self._update_memory()
        self._self_play(loss_gate, ping, stats)
        # abandoned reinforce clip, as i think that is a no-go for AGI...
        # print("\n%s\nFULL EPISODE LENGTH : %i\n%s\n"%('*' * 60, len(self.full_episode), '*' * 60))
        self.full_episode = []
        self.last_train_cap = self.cfg['critic_learn_delta']

    def _self_play(self, loss_gate, ping, stats):
        ping.put(True)
        for _ in range(self.cfg['full_replay_count']):
            samples = self._select()
            if None == samples:
                continue
            self._eval(loss_gate, stats, samples.T)
        ping.get()

    def _update_memory(self):
        goals, states, features, actions, probs, rewards, n_goals, n_states, n_features, n_rewards, good = self.full_episode.T
        goals, states, n_goals, n_states, actions = np.vstack(goals), np.vstack(
            states), np.vstack(n_goals), np.vstack(n_states), np.vstack(actions)

        prios = self.curiosity.weight(states, n_states, actions)

        self.replay.add(
            map(
                lambda i: (goals[i], states[i], features[i], actions[i],
                           probs[i], rewards[i], n_goals[i], n_states[i],
                           n_features[i], n_rewards[i]),
                filter(
                    lambda i: bool(sum(good[i:i + self.cfg['good_reach']])),
                    range(len(states)))),
            prios, hash(states.tostring()))

        self.curiosity.update(states, n_states, actions)

    def _eval(self, loss_gate, stats, args):
        if self.stop:
            return

        goals, states, features, actions, probs, n_goals, n_states, n_features, n_rewards = args
        assert len(n_features) == len(features), "features mismatch"
        if len(n_features) != len(features):
            return

        goals, states, features, actions = np.vstack(goals), np.vstack(
            states), np.vstack(features), np.vstack(actions)
        n_goals, n_states, n_features, n_rewards = np.vstack(n_goals), np.vstack(
            n_states), np.vstack(n_features), np.vstack(n_rewards)

        # func approximators; self play
        n_qa = self.bot.q_future(self.objective_id, n_goals, n_states, n_features)

        # n-step target + bellman equation
        td_targets = n_rewards + self.n_discount * n_qa

        # learn !!
        self.counter += 1
        self.bot.learn_critic(
            self.objective_id, goals, states, features, actions, td_targets,
            self.tau.value() * (0 == self.counter % self.cfg['critic_update_delay']))

        # propagate back to simulation ~ debug purposes
        if None != stats and self.cfg['dbgout'] and not self.stop:
            stats.put("[ TARGET:{:2f} replay::{} ]<----".format(
                td_targets[-1].item(), len(self.replay)))

        # propagate back to main process
        loss_gate.put([goals, states, features, actions, probs, td_targets])

        # WARNING : EXPERIMENT ~~> here we on purpose provide the same features as for the n-state,
        # basically leaking the future of that trajectory -- what will our agent do ?
        # bellman will probably not be proud of me at this point :)
        #loss_gate.put([ goals, states, n_features, actions, probs, td_targets ])

    def _population(self, batch):
        return random.sample(
            range(len(batch)),
            random.randint(1, min(2 * self.cfg['batch_size'], len(batch) - 1)))

    def _do_sampling(self):
        if self.stop:
            return
        batch = self._fast_exp()
        if None == batch:
            return

        # first_order_experience_focus = '''
        for _ in range(self.cfg['fast_exp_epochs']):
            samples = self._select()
            mini_batch = batch if None == samples else np.vstack([batch, samples])
            population = self._population(mini_batch)
            yield mini_batch[population].T

        replay_focused = '''
        for _ in range(self.cfg['fast_exp_epochs']):
            population = self._population(batch)
            samples = self._select()
            if None != samples:
                yield np.vstack([batch[population], samples]).T
            else:
                yield batch[population].T
        # '''

        population = self._population(batch)
        yield batch[population].T  # push towards latest experience

    def _fast_exp(self):
        if max(len(self.replay), len(self.full_episode)) < self.batch_size:
            return None

        goals, states, features, actions, probs, _, n_goals, n_states, n_features, n_rewards, _ = self.full_episode.T
        return np.vstack(
            zip(goals, states, features, actions, probs, n_goals, n_states,
                n_features, n_rewards))

    def _select(self):
        if len(self.replay) < self.batch_size:
            return None
        data = self.replay.sample(self.batch_size, self)
        if None == data:
            return None

        goals, states, features, actions, probs, _, n_goals, n_states, n_features, n_rewards = data
        if not len(actions):
            return None

        self._update_replay_prios(states, n_states, actions, probs)

        return np.vstack(
            zip(goals, states, features, actions, probs, n_goals, n_states,
                n_features, n_rewards))

    def _update_replay_prios(self, states, n_states, actions, probs):
        if not self.cfg['replay_cleaning']:
            return
        states, n_states, actions = np.vstack(states), np.vstack(
            n_states), np.vstack(actions)
        prios = self.curiosity.weight(states, n_states, actions)
        # it seems we are a bit too far off-policy for PG ( PPO ) to do something good,
        # the replay buffer should abandon those
        prios[self.cfg['prob_treshold'] < np.abs(np.vstack(probs).mean(-1))] = 0
        self.replay.update(prios)

    # main bottleneck of the whole solution, but we are experimenting so .. :)
    # with enough hardware/resources this can be scaled properly and run in the
    # background in parallel; if speed were the main concern it would not be done
    # in python in the first place ..
    def reanalyze_experience(self, episode, indices, recalc):
        # imho i iterate through the episode too many times ... better to implement it in one sweep ... TODO
        goals, states, f, a, p = zip(
            *[[e[0][0], e[0][1], e[0][2], e[0][3], e[0][4]] for e in episode])

        goals, states = np.asarray(goals), np.asarray(states)
        if recalc:
            f, p = self.bot.reevaluate(self.objective_id, goals, states, a)

        r, g, s, n_g, n_s = zip(*self.update_goal(*zip(*[
            (  # magic *
                e[0][5],  # rewards .. just so it can forward them back to us
                e[0][0],  # goals ..
                e[0][1],  # states ..
                e[0][6],  # n_goals ..
                e[0][7],  # n_states ..
                # e[0][2], # action .. no need for now, however some emulator may need them
                bool(random.randint(0, self.cfg['her_max_ratio'])),  # update or not
            ) for e in episode
        ])))

        n = [e[0][9] for e in episode
             ] if not recalc or not self.cfg['gae'] else policy.gae(
                 r,
                 self.bot.qa_future(self.objective_id, goals, states,
                                    np.asarray(f), np.asarray(a)),
                 self.discount, self.cfg['gae_tau'])

        for i in indices:
            yield (g[i], s[i], f[i], a[i], p[i], r[i], n_g[i], n_s[i],
                   f[(i + self.n_step) if i + self.n_step < len(f) else -1],
                   n[i])
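
# Illustration only: a minimal numpy version of the Bellman n-step target
# computed in _eval above, td_target = R^(n) + gamma^n * Q(s_{t+n}, a_{t+n}).
# `n_rewards` is assumed to already hold the discounted n-step return and `n_qa`
# the bootstrap value; with gae enabled the discount factor collapses to 1.
def n_step_td_targets_sketch(n_rewards, n_qa, discount, n_step, gae=False):
    n_discount = 1. if gae else discount ** n_step
    return np.asarray(n_rewards) + n_discount * np.asarray(n_qa)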
class Agent:
    def __init__(
            self,
            brain,
            replay_buffer,
            update_goal,  # HER / RNN / GAE essential for ReplayBuffer
            n_groups,
            n_step,
            floating_step,
            gamma,
            good_reach,
            sync_delta,
            learning_delay,
            learning_repeat,
            batch_size,
            # immediate learning
            fresh_frac,
            optim_epochs,
            replay_cleaning,
            prob_treshold,  # PPO
            her_max_ratio,
            gae,
            gae_tau,
            tau_replay_counter,
            tau_base,
            tau_final,
            freeze_delta,
            freeze_count,
    ):
        # TODO : add to config ...
        self.fresh_frac = fresh_frac
        self.optim_epochs = optim_epochs
        self.floating_step = floating_step

        self.brain = brain
        self.update_goal = update_goal

        # encoders freezing!
        self.freeze_delta = freeze_delta
        self.freeze_count = freeze_count
        self.iter = 0
        self.freezed = 0

        self.n_groups = n_groups
        self.n_step = n_step
        self.gamma = gamma
        self.good_reach = good_reach

        self.learning_delay = learning_delay
        self.sync_delta = sync_delta
        self.learning_repeat = learning_repeat
        self.batch_size = batch_size

        self.replay_cleaning = replay_cleaning
        self.prob_treshold = prob_treshold

        self.her_max_ratio = her_max_ratio
        self.gae = gae
        self.gae_tau = gae_tau

        self.replay_buffer = replay_buffer
        self.tau = LinearSchedule(tau_replay_counter,
                                  initial_p=tau_base,
                                  final_p=tau_final)

        self.counter = 0
        self.reset()
        self.steps = 0
        self.last_train_cap = self.learning_delay

    def reset(self):
        (self.goals, self.states, self.features, self.actions, self.probs,
         self.rewards, self.n_goals, self.n_states, self.n_features,
         self.credits, self.discounts, self.goods, self.n_steps) = (
             [], [], [], [], [], [], [], [], [], [], [], [], [])

    def training_loop(self, share_gate):
        while True:
            exp = share_gate.get()
            if None == exp:
                break
            self.train(exp)

    def train(self, exp):
        finished, exp = exp
        if not finished:
            self._inject(False, exp)
        else:
            self._finish(exp)

    def _try_learn(self):
        if self.steps < self.last_train_cap:
            return
        self.last_train_cap += self.learning_delay

        for batch in self._do_sampling():
            self.counter += 1
            for _ in range(self.optim_epochs):
                self.brain.learn(
                    batch,
                    self.tau.value() if (0 == self.counter % self.sync_delta) else 0)

    def _finish(self, exp):
        self._inject(True, exp)
        self._update_memory()
        self.reset()

    def _regroup_for_memory(self):
        g = lambda data: data.reshape(len(data), self.n_groups, -1)
        return (g(self.goals), g(self.states), g(self.features),
                g(self.actions), g(self.probs), g(self.rewards),
                g(self.n_goals), g(self.n_states), g(self.n_features),
                g(self.credits), self.discounts.reshape(-1, 1), g(self.goods))

    def _update_memory(self):
        batch = self._regroup_for_replay()
        prios = self.brain.novelity(batch)

        assert len(self.n_steps) == len(self.discounts) - self.n_step

        g, s, f, a, p, r, n_g, n_s, n_f, n_r, n_d, good = self._regroup_for_memory()
        prios = np.reshape(prios, [len(g), self.n_groups, -1]).mean(2)  # per group

        for i in range(g.shape[1]):
            self.replay_buffer.add(
                map(
                    lambda j: (g[j, i], s[j, i], f[j, i], a[j, i], p[j, i],
                               r[j, i], n_g[j, i], n_s[j, i], n_f[j, i],
                               n_r[j, i], n_d[j],
                               self.n_steps[j] if j < len(n_d) - self.n_step else None,
                               bool((j < len(s) - self.n_step) and
                                    good[j:j + self.good_reach, i].sum())),
                    range(len(s))),
                prios[:, i], hash(s[:i:].tostring()))

    def _population(self, batch, limit):
        return random.sample(range(len(batch)),
                             random.randint(1, min(limit, len(batch) - 1)))

    def _regroup_for_replay(self):
        g = lambda data: np.reshape(data, [len(data) * self.n_groups, -1])
        return (g(self.goals), g(self.states), g(self.features),
                g(self.actions), g(self.probs), g(self.n_goals),
                g(self.n_states), g(self.n_features), g(self.credits),
                self.discounts.reshape(-1, 1))

    def _do_sampling(self):
        if max(len(self.replay_buffer), len(self.states)) < self.batch_size:
            return None
        if not len(self.states):
            return None

        batch = np.vstack(zip(*self._regroup_for_replay()))

        for i in range(self.learning_repeat):
            self._encoder_freeze_schedule()

            samples = self._select()
            # keep an eye on the latest experience
            population = [] if (not self.fresh_frac or i % self.fresh_frac
                                ) else self._population(
                                    batch, self.batch_size // self.fresh_frac)
            mini_batch = batch[population] if None == samples else np.vstack(
                [samples, batch[population]])
            yield mini_batch.T

    def _encoder_freeze_schedule(self):
        if not self.freeze_delta:
            return
        self.iter += (0 == self.freezed)
        if self.iter % self.freeze_delta:
            return
        if not self.freezed:
            self.brain.freeze_encoders()
        self.freezed += 1
        if self.freezed <= self.freeze_count:
            return
        self.freezed = 0
        self.brain.unfreeze_encoders()

    def _select(self):
        if len(self.replay_buffer) < self.batch_size:
            return None
        batch = self.replay_buffer.sample(self.batch_size, self)
        if None == batch:
            return None

        goals, states, features, actions, probs, _, n_goals, n_states, n_features, credits, discounts, _, _ = batch
        batch = (goals, states, features, actions, probs, n_goals, n_states,
                 n_features, credits, discounts)

        self._update_replay_prios(batch)
        return np.vstack(zip(*batch))

    def _update_replay_prios(self, batch):
        # maybe better to do sum(1), but then we need to propagate this when adding
        # to the replay buffer as well ( aka .sum(2).mean(2) )
        prios = self.brain.novelity(batch).mean(1)  # mean through rewards for objectives

        # it seems we are a bit too far off-policy for PG ( PPO ) to do something good,
        # the replay buffer should abandon those
        if self.replay_cleaning:
            (states, actions, goals) = (np.vstack(batch[1]), np.vstack(batch[3]),
                                        np.vstack(batch[0]))
            new_probs = self.brain.reevaluate(goals, states, actions)
            prios[self.prob_treshold[1] < np.abs(new_probs.mean(-1))] = 1e-10
            prios[self.prob_treshold[0] > np.abs(new_probs.mean(-1))] = 1e-10

        self.replay_buffer.update(prios)

    def _assign_credit(self, n_groups, n_steps, rewards, goals, states,
                       features, actions, stochastic):
        regroup = lambda data: np.reshape(data, [len(data), n_groups, -1])

        grouped_rewards = regroup(rewards)
        if not self.gae:
            return policy.k_discount(n_steps, self.gamma), np.concatenate([
                policy.k_step(n_steps, grouped_rewards[:, i], self.gamma)
                for i in range(n_groups)
            ], 1)

        grouped_goals = regroup(goals)
        grouped_states = regroup(states)
        grouped_features = regroup(features)
        grouped_actions = regroup(actions)

        return policy.gae_discount(n_steps, self.gamma, self.gae_tau), np.concatenate([
            policy.gae(
                n_steps, grouped_rewards[:len(grouped_states) - 1, i],
                self.brain.qa_future(grouped_goals[:, i], grouped_states[:, i],
                                     grouped_features[:, i],
                                     grouped_actions[:, i]),
                self.gamma, self.gae_tau) for i in range(n_groups)
        ], 1)

    def _random_n_step_her(self, length, inds):
        do_n_step = lambda n: self.n_step if not self.floating_step else random.randint(n, self.n_step)
        n_step = lambda i: 1 if inds[i] else do_n_step(
            1 if length - 1 > i + self.n_step else (length - i - 1))
        return self._do_random_n_step(length, n_step)

    def _random_n_step(self, length):
        do_n_step = lambda n: self.n_step if not self.floating_step else random.randint(n, self.n_step)
        n_step = lambda i: do_n_step(
            1 if length - 1 > i + self.n_step else (length - i - 1))
        return self._do_random_n_step(length, n_step)

    def _do_random_n_step(self, length, n_step):
        n_steps = [n_step(i) for i in range(length - self.n_step)]
        indices = np.asarray(n_steps) + np.arange(len(n_steps))
        indices = np.hstack([indices, self.n_step * [-1]])
        return n_steps, indices

    def _redistribute_rewards(self, n_groups, n_steps, indices, rewards, goals,
                              states, features, actions, stochastic):
        # n-step, n-discount, n-return - Q(last state)
        discounts, credits = self._assign_credit(n_groups, n_steps, rewards,
                                                 goals, states, features,
                                                 actions, stochastic)

        discounts = np.hstack([discounts, self.n_step * [0]])
        credits = np.vstack([credits, np.zeros([self.n_step, len(credits[0])])])

        return (  # for self-play
            credits,
            discounts,
            goals[indices],
            states[indices],
            features[indices],
        )

    def _inject(self, finished, exp):
        goals, states, features, actions, probs, rewards, goods = exp
        if not len(states):
            return  # can happen at the end of an episode; we just handle it as a notification

        self.steps += len(states) - (self.n_step if not finished else 1)

        n_steps, n_indices = self._random_n_step(len(rewards))

        credits, discounts, n_goals, n_states, n_features = self._redistribute_rewards(
            self.n_groups, n_steps, n_indices, rewards, goals, states,
            features, actions, stochastic=True)

        if not finished:  # scatter overlapping info
            goods = goods[:-self.n_step]
            goals = goals[:-self.n_step]
            states = states[:-self.n_step]
            features = features[:-self.n_step]
            actions = actions[:-self.n_step]
            probs = probs[:-self.n_step]
            rewards = rewards[:-self.n_step]
            n_goals = n_goals[:-self.n_step]
            n_states = n_states[:-self.n_step]
            n_features = n_features[:-self.n_step]
            credits = credits[:-self.n_step]
            discounts = discounts[:-self.n_step]
            # this branch was not properly evaluated imho ...

        self.goods = np.vstack([self.goods, goods]) if len(self.goods) else goods
        self.goals = np.vstack([self.goals, goals]) if len(self.goals) else goals
        self.states = np.vstack([self.states, states]) if len(self.states) else states
        self.features = np.vstack([self.features, features]) if len(self.features) else features
        self.actions = np.vstack([self.actions, actions]) if len(self.actions) else actions
        self.probs = np.vstack([self.probs, probs]) if len(self.probs) else probs
        self.rewards = np.vstack([self.rewards, rewards]) if len(self.rewards) else rewards
        self.n_goals = np.vstack([self.n_goals, n_goals]) if len(self.n_goals) else n_goals
        self.n_states = np.vstack([self.n_states, n_states]) if len(self.n_states) else n_states
        self.n_features = np.vstack([self.n_features, n_features]) if len(self.n_features) else n_features
        self.credits = np.vstack([self.credits, credits]) if len(self.credits) else credits
        self.discounts = np.hstack([self.discounts, discounts]) if len(self.discounts) else discounts
        self.n_steps = np.hstack([self.n_steps, n_steps]) if len(self.n_steps) else n_steps

        self._try_learn()

    def _her(self, inds):
        collision_free = lambda i, ind: ind - self.n_step > inds[i - 1] and ind + 1 != inds[i + 1]
        hers = [-1] + [
            i for i, ind in enumerate(inds[1:-1]) if collision_free(i + 1, ind)
        ]

        pivot = 1
        indices = [inds[0]]
        hers.append(len(inds) - 1)
        for i, ind in enumerate(inds[1:]):
            if i == hers[pivot] or indices[-1] + 1 == ind or (
                    0 != random.randint(0, 1 + (i - hers[pivot - 1]) // self.her_max_ratio)
                    and indices[-1] == inds[i]):
                indices.append(ind)
                if i == hers[pivot]:
                    pivot += 1
        return indices

        # debug consistency check below is unreachable because of the early return above
        non_her = list(filter(lambda i: i not in indices, inds))
        for i in non_her:
            for j in indices:
                assert i + self.n_step < j or i - 1 > j, \
                    "OPLA {} || {} >< {}\n>{} ({}::{})".format(
                        indices, inds, non_her, hers, i, j)
        return indices

    # well TODO : refactor access to replay buffer entries, from indexing and
    # stacking to a proper class and querying
    # TODO-2 : factor out those nasty recalc ifs => make two separate functions!!
    def reanalyze_experience(self, episode, indices, recalc, cross_experience):
        f, a, p, c, d, n_steps = zip(
            *[[e[0][2], e[0][3], e[0][4], e[0][9], e[0][10], e[0][11]]
              for e in episode])

        inds = sorted(indices)
        cache = np.zeros(len(episode))
        if not recalc:
            cache[self._her(inds)] = 1

        n_steps, n_indices = self._random_n_step_her(len(episode), cache)

        # even if we don't recalc here, HER or another REWARD 'shaper' will do its job!!
        r, g, s, n_g, n_s, active = zip(*self.update_goal(  # this will change goals not states, OK for Q-function in GAE
            *zip(*[
                (  # magic *
                    e[0][5],  # rewards .. just so it can forward them back to us
                    e[0][0],  # goals ..
                    e[0][1],  # states ..
                    episode[n_indices[i]][0][0],  # n_goals ..
                    episode[n_indices[i]][0][1],  # n_states ..
                    # e[0][2], # action .. no need for now, however some emulator may need them
                    bool(cache[i]),
                    n_steps[i] if len(episode) - self.n_step > i else None,
                ) for i, e in enumerate(episode)
            ])))

        g, s, a, p = np.asarray(g), np.asarray(s), np.asarray(a), np.asarray(p)
        if recalc:
            f = self.brain.recalc_feats(g, s, a)
        else:
            f = np.asarray(f)

        # TODO : apparently the redistribute-rewards functions should be decomposed
        c, d, _, _, _ = self._redistribute_rewards(  # here update_goal worked on its own form of n_goal, so don't touch it here!
            1, n_steps, n_indices, r, g, s, f, a, stochastic=True)

        for i in range(len(episode)):  #indices:
            yield (
                [
                    g[i], s[i], f[i], a[i], p[i], r[i], n_g[i], n_s[i],
                    f[n_indices[i]] if len(episode) - self.n_step > i else None,  #episode[i][0][8],
                    c[i], d[i],
                    n_steps[i] if len(episode) - self.n_step > i else None,
                    episode[i][0][-1]
                ],
                active[i] and episode[i][0][-1])
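
# Illustration only: the fixed-step case of _do_random_n_step above. Position i
# bootstraps from position i + n_step, and the trailing n_step positions, which
# have no full lookahead, map to -1 (the episode end). The floating_step variant
# draws each step size with random.randint instead.
def n_step_indices_sketch(length, n_step):
    n_steps = [n_step] * (length - n_step)
    indices = np.hstack([np.asarray(n_steps) + np.arange(len(n_steps)),
                         n_step * [-1]])
    return n_steps, indices
# n_step_indices_sketch(6, 2) -> ([2, 2, 2, 2], array([ 2,  3,  4,  5, -1, -1]))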
class Zer0Bot:
    def __init__(self, bot_id, cfg, task_factory, encoder, Actor, Critic, goal_encoder):
        self.cfg = cfg

        self.bot = Bot(cfg, bot_id, encoder, goal_encoder, Actor, Critic,
                       task_factory.state_size, task_factory.action_size,
                       task_factory.wrap_action, task_factory.wrap_value)
        self.bot.share_memory()  # !! must be done from the main process !!

        self.iter = 0
        self.freezed = 0

        self.counter = 1
        self.tau = LinearSchedule(cfg['tau_replay_counter'], cfg['tau_base'], cfg['tau_final'])

        self.lock = Lock()
        self.bot = BotProxy(self.lock, cfg, self.bot, cfg['device'])

    def act(self, goal, state, history):  # TODO rename to exploit
        return self.bot.exploit(goal, state, history)

    def train(self, loss, mcts, signal):
        self._encoder_freeze_schedule()

        # seed = [random.randint(0, self.cfg['mcts_random_cap'])] * self.cfg['mcts_rounds']
        for c in mcts:  # maybe we want to * n_episodes ...
            c.put([random.randint(0, self.cfg['mcts_random_cap'])] * self.cfg['mcts_rounds'])

        while all(c.empty() for c in signal):
            self._train_worker(loss)

        scores = []
        for s in signal:
            scores += s.get()
        return scores

    def _train_worker(self, loss):
        time.sleep(.1)
        status = self._update_policy(self.tau.value(), loss)
        if not status:
            return
        self.counter += 1

    def _update_policy(self, tau, loss_gate):
        if any(c.empty() for c in loss_gate):
            return False

        states, grads, actions = zip(*map(
            lambda i: self._get_grads(i, loss_gate[i].get()),
            range(self.cfg['n_simulations'])))

        if self.cfg["attention_enabled"]:
            # otherwise we would scatter additional info which is hard to weight w/o more context
            gran = min(map(len, grads))
            states = np.vstack([s[:gran] for s in states])
            actions = np.vstack([a[:gran] for a in actions])
            grads = torch.cat([g[:gran] for g in grads])
        else:
            states = np.vstack(states)
            actions = np.vstack(actions)
            grads = torch.cat(grads)

        # in case of PPO it is safe to move at full force
        tau = 1. if not self.cfg['ddpg'] else tau * (0 == self.counter % self.cfg['actor_update_delay'])
        self.bot.learn_actor(states, grads, actions, tau)
        return True

    def _get_grads(self, i, s_f_a_p_td):
        # this is ok to call, as we are only using content which is known at creation time ( immutable )
        s, w, a = self._qa_function(i, *s_f_a_p_td)
        return s, w, a

    def _qa_function(self, objective_id, goals, states, history, actions, probs, td_targets):
        qa, dist = self.bot.q_explore(objective_id, goals, states, history)

        loss = self._qa_error(qa, td_targets)
        if self.cfg['normalize_advantages']:
            loss = policy.normalize(loss)

        probs = np.vstack(probs)
        grads = policy.policy_loss(torch.tensor(probs),
                                   dist.log_prob(torch.tensor(actions)), loss,
                                   self.cfg['ppo_eps'], self.cfg['dbgout_ratio'])

        return states, grads, actions

    def _qa_error(self, qa, td_targets):
        if not self.cfg['advantages_enabled']:
            return qa

        td_error = torch.tensor(td_targets).to(qa.device) - qa
        # in case of ddpg we calculate the advantage a bit differently ~ check _eval and what is fed here;
        # the tables are basically turned in favour of performance, we calculate grads w.r.t. the other action
        if self.cfg['ddpg']:
            td_error = -td_error

        if not self.cfg['advantages_boost']:
            return td_error

        for i, e in enumerate(td_error):
            td_error[i] = e if abs(e) > 1e-5 else qa[i]
        return td_error

    def _encoder_freeze_schedule(self):
        if not self.cfg['freeze_delta']:
            return
        self.iter += (0 == self.freezed)
        if self.iter % self.cfg['freeze_delta']:
            return
        if not self.freezed:
            self.bot.freeze_encoders()
        self.freezed += 1
        if self.freezed <= self.cfg['freeze_count']:
            return
        self.freezed = 0
        self.bot.unfreeze_encoders()
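
# Illustration only: a self-contained trace of the freeze/unfreeze cadence that
# _encoder_freeze_schedule implements (the Agent class uses the same scheme).
# _Recorder is a hypothetical stand-in for the bot/brain that just logs calls.
class _Recorder:
    def __init__(self):
        self.log = []

    def freeze_encoders(self):
        self.log.append("freeze")

    def unfreeze_encoders(self):
        self.log.append("unfreeze")


def freeze_schedule_trace(freeze_delta, freeze_count, n_calls):
    bot, it, freezed = _Recorder(), 0, 0
    for _ in range(n_calls):
        it += (0 == freezed)  # iterations advance only while unfrozen
        if it % freeze_delta:
            continue
        if not freezed:
            bot.freeze_encoders()
        freezed += 1
        if freezed <= freeze_count:
            continue
        freezed = 0
        bot.unfreeze_encoders()
    return bot.log
# freeze_schedule_trace(5, 2, 20) -> ['freeze', 'unfreeze', 'freeze', 'unfreeze', 'freeze']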
class ReplayBuffer:
    def __init__(self, cfg):
        self.cfg = cfg
        self.inds = None
        self.beta = LinearSchedule(cfg['replay_beta_iters'],
                                   initial_p=cfg['replay_beta_base'],
                                   final_p=cfg['replay_beta_top'])

        self.counter = 0
        self.timetable = [0] * cfg['replay_size']
        self.delta = cfg['freeze_count'] + cfg['freeze_delta']

        self.mem = Memory(cfg['replay_size'], cfg['select_count'], cfg['replay_alpha'])

    def sample(self, batch_size, critic):
        self.inds, data = zip(*self._sample(batch_size, critic))
        # TODO : avoid the numpy vstack + transpose round trip
        data = np.vstack(data)
        return data.T

    def add(self, batch, prios, hashkey):
        if len(prios) < self.cfg['n_step'] * 2:
            return
        if not self._worth_experience(prios):
            return

        self.counter = (self.counter + 1) % len(self.timetable)
        # do first update when we do first freeze
        self.timetable[self.counter] = self.counter - self.delta - self.cfg['freeze_count']

        for i, data in enumerate(batch):
            self.mem.add([data, i, len(prios) - i - 1, hashkey, self.counter], prios[i])

    def _worth_experience(self, prios):
        if not len(self):
            return True
        if len(self) < self.cfg['replay_size']:
            return True
        for _ in range(10):
            data = self.mem.select(1.)
            if None == data:
                return True
            _, w, _ = data
            status = prios.mean() > np.mean(w)
            if status:
                return True
        return 0 == random.randint(0, 4)

    def _sample(self, batch_size, critic):
        count = 0
        while not count:
            data = self.mem.select(self.beta.value())
            if None == data:
                continue
            batch, _, inds = data
            if None == batch:
                continue

            data, local_forward, local_backward, hashkey, timestamp = zip(*batch)
            uniq = set(map(lambda i_b: i_b[0] - i_b[1], zip(inds, local_forward)))

            for i, b, f, k, t in zip(inds, local_backward, local_forward,
                                     hashkey, timestamp):
                # if count >= self.cfg['max_ep_draw_count']:
                #     break
                pivot = i - f
                if pivot < 0 or pivot + b + f > len(self):
                    continue  # temporarily we want to avoid this corner case .. TODO
                if pivot not in uniq:
                    continue
                uniq.remove(pivot)
                # yield (i, self.mem.tree.data[i][0])
                # continue
                count += 1
                yield zip(*self._do_sample_wrap(pivot, b + f, critic, k, t))

    def _do_sample_wrap(self, pivot, length, critic, hashkey, timestamp):
        return self._do_sample(self.mem.tree.data[pivot:pivot + length], pivot,
                               length, critic, hashkey, timestamp)

    def _do_sample(self, full_episode, pivot, length, critic, _, timestamp):
        available_range = range(length)
        top = min(len(available_range), self.cfg['max_ep_draw_count'])
        replay = random.sample(available_range, random.randint(1, top))

        recalc = abs(self.timetable[timestamp] - self.counter) > self.delta * 2

        if not critic or not self.cfg['replay_reanalyze']:
            episode = map(lambda i: full_episode[i][0], replay)
        else:
            episode = critic.reanalyze_experience(full_episode, replay, recalc)

        if recalc:
            self.timetable[timestamp] = self.counter

        for i, step in zip(replay, episode):
            yield pivot + i, step

    def update(self, prios):
        '''
        replay buffer must be single-thread style access, or properly locked
        ( sample, update, add ) .. well, in theory; as it is not expanding we
        don't care much about concurrent reads .. for now
        '''
        self.mem.priority_update(np.hstack(self.inds), prios)
        self.inds = None

    def __len__(self):
        return len(self.mem)
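
# Illustration only: the cfg keys this cfg-driven ReplayBuffer reads, gathered
# from the code above into one example dict. The values are placeholders, not
# recommended settings.
EXAMPLE_REPLAY_CFG = {
    'replay_size': 100000,        # capacity of the sum-tree and the timetable
    'select_count': 32,           # second argument passed to Memory
    'replay_alpha': 0.6,          # third argument passed to Memory
    'replay_beta_base': 0.4,      # LinearSchedule initial_p
    'replay_beta_top': 1.0,       # LinearSchedule final_p
    'replay_beta_iters': 100000,  # LinearSchedule horizon
    'n_step': 3,                  # add() skips episodes shorter than n_step * 2
    'max_ep_draw_count': 8,       # cap on transitions drawn per episode in _do_sample
    'replay_reanalyze': False,    # route samples through critic.reanalyze_experience
    'freeze_count': 2,            # recalc window: delta = freeze_count + freeze_delta
    'freeze_delta': 100,
}
# buffer = ReplayBuffer(EXAMPLE_REPLAY_CFG)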