def learn(self, num_iter=100, temperature=1., print_lag=None):
    for it in range(num_iter):
        dqn = self.dqn_mt
        bprop = self.bprop
        # sample a minibatch, drawn with replacement.
        samples = prob.choice(self.experiences, self.minibatch_size, replace=True)
        is_valids = []
        targets = []
        states = []
        actions = np.zeros(self.minibatch_size, dtype=int)
        for idx, sample in enumerate(samples):
            # randomly choose a goal.
            goal = prob.choice(self.goals, 1)[0]
            dqn = self.dqn_by_goal[goal]
            state, last_action, next_state, reward, meta = sample
            valid_actions = meta['last_valid_actions']
            num_actions = meta['num_actions']
            raw_state = np.array(state['raw_state'])
            raw_state[1, goal[0], goal[1]] = 1.
            states.append(raw_state)
            # 0/1 mask over all actions (1 = valid), so rows stay the same length.
            is_valid = [1. if action in set(valid_actions) else 0. for action in range(num_actions)]
            if self.loss == 'KL':
                target = dqn._get_softmax_action_distribution(raw_state, temperature=temperature,
                                                              valid_actions=valid_actions)
            elif self.loss == 'l2' or self.loss == 'l1' or self.loss == 'l1-exp':
                target = dqn.av(raw_state)
            elif self.loss == 'l1-action':
                target = [dqn.av(raw_state)[last_action]]
                is_valid = [is_valid[last_action]]
            is_valids.append(is_valid)
            targets.append(target)
            actions[idx] = last_action
        states = np.array(states)
        targets = np.array(targets)
        is_valids = np.array(is_valids)
        score = self.bprop(states, actions, targets, is_valids)
        if print_lag and print_lag > 0 and it % print_lag == 0:
            print 'iter = ', it, 'score = ', score
def reset(self):
    self.ale.reset_game()
    self.frame_id = 0
    self.cum_reward = 0
    if self.skip_frame:
        for frame_i in range(self.skip_frame):
            self.step(choice(self.valid_actions, 1)[0])
def train(self, num_iter=100):
    '''
    supervised learning on the experience buffer.
    '''
    states = [None] * self.minibatch_size
    is_valids = [None] * self.minibatch_size
    probs = [None] * self.minibatch_size
    experience = sum(self.experience.values(), [])
    for it in range(num_iter):
        samples = prob.choice(experience, self.minibatch_size, replace=True)
        for idx, sample in enumerate(samples):
            state, p, is_valid = sample
            states[idx] = state
            is_valids[idx] = is_valid
            probs[idx] = p
        # convert into numpy array.
        states = np.array(states)
        is_valids = np.array(is_valids)
        probs = np.array(probs)
        error = self.bprop(states, probs, is_valids)
        print 'error', error
def __init__(self, rom_path, num_frames=4, live=False, skip_frame=0, mode='normal'):
    self.ale = ALEInterface()
    if live:
        USE_SDL = True
        if USE_SDL:
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                self.ale.setBool('sound', False)  # Sound doesn't work on OSX
            elif sys.platform.startswith('linux'):
                self.ale.setBool('sound', True)
            self.ale.setBool('display_screen', True)
    self.mode = mode
    self.live = live
    self.ale.loadROM(rom_path)
    self.num_frames = num_frames
    self.frames = []
    self.frame_id = 0
    self.cum_reward = 0
    self.skip_frame = skip_frame
    if mode == 'small':
        img = T.matrix('img')
        self.max_pool = theano.function([img], max_pool_2d(img, [4, 4]))
        self.img_shape = (16, 16)
    else:
        self.img_shape = (84, 84)  # image shape according to DQN Nature paper.
    while len(self.frames) < 4:
        self.step(choice(self.valid_actions, 1)[0])
    self.reset()
def run(self, task=None, num_epochs=10, num_episodes=100, tol=1e-4):
    if task:
        self.reset(task)
    task = self.last_task
    for ei in range(num_epochs):
        # run DQN on task for #episodes.
        self.run_task(task, num_episodes=num_episodes, tol=tol)
        task.reset()
        # compute average td error after learning.
        ex_buffer = self._filter_experience_by_task(task)
        td = self._average_td_error(ex_buffer)
        # learn the meta-model.
        feat = self.feat_func(task)
        self.meta_model.learn(feat, td)
        # sample a new task based on the meta-model.
        task_nb = self.edit_func(task)
        task_nb.append(task)  # include this task.
        val_nb = []
        for new_task in task_nb:
            new_task_feat = self.feat_func(new_task)
            val_nb.append(self.meta_model.get(new_task_feat))
        print 'val_nb', val_nb
        log_prob = prob.normalize_log(np.array(val_nb) * 1.)
        p = np.exp(log_prob)
        print 'probability', p
        next_task = prob.choice(task_nb, 1, replace=True, p=p)[0]
        print 'new_task', next_task
        task = next_task
def _update_net(self):
    '''
    sample from the memory dataset and perform gradient descent on
    (target - Q(s, a))^2
    '''
    # don't update the network until sufficient experience has been
    # accumulated.
    # removing this might cause correlation for early samples. suggested to be used in curriculums.
    # if len(self.experience) < self.memory_size:
    #     return
    for nn_bi in range(self.nn_num_batch):
        states = [None] * self.minibatch_size
        next_states = [None] * self.minibatch_size
        actions = np.zeros(self.minibatch_size, dtype=int)
        rewards = np.zeros(self.minibatch_size)
        nvas = []
        # sample and process minibatch.
        # samples = random.sample(self.experience, self.minibatch_size)  # draw without replacement.
        samples = prob.choice(self.experience, self.minibatch_size, replace=True)  # draw with replacement.
        terminals = []
        for idx, sample in enumerate(samples):
            state, action, next_state, reward, nva = sample
            states[idx] = state
            actions[idx] = action
            rewards[idx] = reward
            nvas.append(nva)
            if next_state is not None:
                next_states[idx] = next_state
            else:
                next_states[idx] = state
                terminals.append(idx)
        # convert states into tensor.
        states = np.array(states)
        next_states = np.array(next_states)
        # compute target reward + \gamma max_{a'} Q(ns, a').
        # Ensure target = reward when NEXT_STATE is terminal.
        next_qvals = self.dqn.fprop(next_states)
        next_vs = np.zeros(self.minibatch_size)
        for idx in range(self.minibatch_size):
            if idx not in terminals:
                next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])
        targets = rewards + self.gamma * next_vs
        ## diagnostics.
        # print 'targets', targets
        # print 'next_qvals', next_qvals
        # print 'pure prop', self.dqn.fprop(states)
        # print 'prop', self.dqn.fprop(states)[range(states.shape[0]), actions]
        # print 'actions', actions
        nn_error = []
        for nn_it in range(self.nn_num_iter):
            error = self.bprop(states, actions, targets.flatten())
            nn_error.append(float(error))
        self.diagnostics['nn-error'].append(nn_error)
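# Illustrative sketch (standalone, with made-up numbers, not part of the class above):
# the target computation in _update_net is the standard Q-learning backup
#     target = r + gamma * max_{a' in valid actions} Q(s', a'),
# with the bootstrap term dropped for terminal transitions.
import numpy as np

gamma = 0.95
next_qvals = np.array([[0.1, 0.5, -0.2],    # Q(s'_0, .)
                       [0.3, 0.0,  0.7]])   # Q(s'_1, .)
nvas = [[0, 1], [0, 1, 2]]                  # valid actions in each next state.
rewards = np.array([1.0, 0.0])
terminals = {0}                             # sample 0 ended the episode.

next_vs = np.zeros(len(rewards))
for idx in range(len(rewards)):
    if idx not in terminals:
        next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])

targets = rewards + gamma * next_vs         # -> [1.0, 0.665]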
def run(self, tasks, num_epochs=1, num_episodes=1):
    for ei in range(num_epochs):
        t = self.t
        # task selection.
        if t == 0:
            # no prior experience, choose randomly.
            task = prob.choice(tasks, 1)[0]
        else:
            # GP-t.
            N = len(self.ims)
            KXX = np.zeros((N, N))
            y = np.zeros(N)
            for (t_i, (task_i, im_i)) in enumerate(self.ims):
                for (t_j, (task_j, im_j)) in enumerate(self.ims):
                    KXX[t_i, t_j] = self.kernel_func(t_i, task_i, t_j, task_j)
            for (ti, (task_i, im_i)) in enumerate(self.ims):
                y[ti] = im_i
            M = len(tasks)
            KXsX = np.zeros((M, N))
            KXsXs = np.zeros((M, M))
            for (t_i, task_i) in enumerate(tasks):
                for (t_j, (task_j, im_j)) in enumerate(self.ims):
                    KXsX[t_i, t_j] = self.kernel_func(t, task_i, t_j, task_j)
                KXsXs[t_i, t_i] = self.kernel_func(t, task_i, t, task_i)
            KXXinv = npla.inv(KXX + self.gpt_sigma ** 2 * np.eye(N))
            pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
            pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
            pred_sigma = np.sqrt(np.diag(pred_cov))
            pred_ucb = pred_mean + self.gpt_kappa * pred_sigma
            best_ti = np.argmax(pred_ucb)
            task = tasks[best_ti]
            # store information for diagnosis.
            self.diagnostics['mean'] = {str(task): mean for (task, mean) in zip(tasks, pred_mean)}
            self.diagnostics['sigma'] = {str(task): sigma for (task, sigma) in zip(tasks, pred_sigma)}
            self.diagnostics['ucb'] = {str(task): ucb for (task, ucb) in zip(tasks, pred_ucb)}
        score_before = self.eval_func(task)
        self.train_func(task)
        score_after = self.eval_func(task)
        im = score_after - score_before
        self.diagnostics['chosen_task'] = str(task)
        self.diagnostics['im'] = im
        self.ims.append((task, im))
        self.t += 1
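# Illustrative sketch (standalone, toy data): the GP-based selection above computes
# the usual Gaussian-process posterior
#     mean  = K_*X (K_XX + sigma_n^2 I)^{-1} y
#     cov   = K_**  - K_*X (K_XX + sigma_n^2 I)^{-1} K_*X^T
# and then picks the candidate maximizing the UCB score mean + kappa * sigma.
# The scalar "tasks" and RBF kernel here are assumptions made only for the example.
import numpy as np
import numpy.linalg as npla

kernel = lambda x, z: np.exp(-0.5 * (x - z) ** 2)   # toy RBF kernel on scalar tasks.
X = np.array([0.0, 1.0, 2.0])                        # observed tasks.
y = np.array([0.2, 0.5, 0.1])                        # observed improvements.
Xs = np.array([0.5, 1.5, 3.0])                       # candidate tasks.
sigma_n, kappa = 0.1, 1.0

KXX = kernel(X[:, None], X[None, :])
KXsX = kernel(Xs[:, None], X[None, :])
KXsXs = kernel(Xs[:, None], Xs[None, :])

KXXinv = npla.inv(KXX + sigma_n ** 2 * np.eye(len(X)))
pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, KXsX.T))
pred_sigma = np.sqrt(np.diag(pred_cov))
pred_ucb = pred_mean + kappa * pred_sigma
best_candidate = Xs[np.argmax(pred_ucb)]             # candidate chosen next.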
def learn(self, num_iter=100, temperature=1., print_lag=None):
    for it in range(num_iter):
        dqn = self.dqn_mt
        bprop = self.bprop
        # sample a minibatch, drawn with replacement.
        samples = prob.choice(self.experiences, self.minibatch_size, replace=True)
        is_valids = []
        probs = []
        states = []
        for idx, sample in enumerate(samples):
            # randomly choose a goal.
            goal = prob.choice(self.goals, 1)[0]
            dqn = self.dqn_by_goal[goal]
            state, action, next_state, reward, meta = sample
            valid_actions = meta['last_valid_actions']
            num_actions = meta['num_actions']
            raw_state = np.array(state['raw_state'])
            raw_state[1, goal[0], goal[1]] = 1.
            states.append(raw_state)
            # 0/1 mask over all actions (1 = valid), so rows stay the same length.
            is_valid = [1. if action in set(valid_actions) else 0. for action in range(num_actions)]
            is_valids.append(is_valid)
            prob_vec = dqn._get_softmax_action_distribution(raw_state, temperature=temperature,
                                                            valid_actions=valid_actions)
            probs.append(prob_vec)
        states = np.array(states)
        probs = np.array(probs)
        is_valids = np.array(is_valids)
        score = self.bprop(states, probs, is_valids)
        if print_lag and print_lag > 0 and it % print_lag == 0:
            print 'iter = ', it, 'score = ', score
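# Illustrative sketch: what a temperature-softmax over Q-values restricted to valid
# actions might look like. The actual _get_softmax_action_distribution is defined
# elsewhere in the repo; this standalone version is an assumption that only shows
# the idea, not the repo's implementation.
import numpy as np

def softmax_action_distribution(qvals, temperature=1., valid_actions=None):
    qvals = np.asarray(qvals, dtype=float)
    if valid_actions is None:
        valid_actions = range(len(qvals))
    valid_actions = list(valid_actions)
    logits = np.full(len(qvals), -np.inf)
    logits[valid_actions] = qvals[valid_actions] / temperature
    logits -= np.max(logits[valid_actions])   # stabilize the exponent.
    p = np.exp(logits)                        # invalid actions get probability 0.
    return p / p.sum()

# e.g. softmax_action_distribution([1., 2., 0.], temperature=0.5, valid_actions=[0, 1])
# puts all probability mass on actions 0 and 1, favoring action 1.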
def learn(self, num_iter=10, print_lag=50, dqn_mt=None):
    for it in range(num_iter):
        for (goal, dqn) in self.dqn_by_goal.items():
            bprop = self.bprop_by_goal[goal]
            # sample a minibatch.
            samples = prob.choice(self.experiences, self.minibatch_size, replace=True)  # draw with replacement.
            states = [None] * self.minibatch_size
            next_states = [None] * self.minibatch_size
            actions = np.zeros(self.minibatch_size, dtype=int)
            rewards = np.zeros(self.minibatch_size)
            nvas = []
            terminals = []
            for idx, sample in enumerate(samples):
                state, action, next_state, reward, meta = sample
                nva = meta['curr_valid_actions']
                states[idx] = np.array(state['raw_state'])
                states[idx][1, goal[0], goal[1]] = 1.
                actions[idx] = action
                reward = next_state['pos'][goal[0], goal[1]]  # TODO: hack for gridworld.
                rewards[idx] = reward
                nvas.append(nva)
                next_states[idx] = np.array(next_state['raw_state'])
                next_states[idx][1, goal[0], goal[1]] = 1.
                if reward > 0.:
                    terminals.append(idx)
            states = np.array(states)
            next_states = np.array(next_states)
            # learn through backpropagation.
            shared_values = dqn_mt.fprop(next_states)[range(len(actions)), actions]
            next_qvals = dqn.fprop(next_states)
            next_vs = np.zeros(self.minibatch_size)
            for idx in range(self.minibatch_size):
                if idx not in terminals:
                    next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])
            targets = rewards + self.gamma * next_vs
            error = bprop(states, actions, targets.flatten(), shared_values)
            if print_lag and print_lag > 0 and it % print_lag == 0:
                print 'iter = ', it, 'error = ', error
def run(self, tasks, num_epochs=1):
    for ni in range(num_epochs):
        sub_tasks = prob.choice(tasks, size=self.num_sample, replace=False)
        ims = []
        self.diagnostics['im_task'] = {}
        for task in sub_tasks:
            im = self._eval_im(task)
            ims.append(im)
            self.diagnostics['im_task'][task] = im
        max_ind = np.argmax(ims)
        chosen_task = sub_tasks[max_ind]
        self.diagnostics['chosen_task'] = str(chosen_task)
        self.train_func(self.learner, chosen_task)
def sample(self, **kwargs):
    if self.coord_data:
        (_, octopus_x, octopus_y) = prob.choice(self.coord_data, 1)[0]
    else:
        octopus_x = npr.randint(0, SCREEN_WIDTH)
        octopus_y = npr.randint(0, SCREEN_HEIGHT)
    task_id = os.path.join('octopus', str(octopus_x) + '_' + str(octopus_y))
    absolute_task_path = os.path.join(LEVEL_PATH, task_id + '.txt')
    with open(absolute_task_path, 'w') as f:
        for (obj, x, y) in self.data:
            if obj == 'octopus':
                f.write(','.join([obj, str(octopus_x), str(octopus_y)]) + '\n')
            else:
                f.write(','.join([obj, str(x), str(y)]) + '\n')
    return OctopusTask(level=task_id, **kwargs)
def learn(self, experience_by_dqn, num_iter=100, temperature=1., print_lag=None):
    # merge the per-DQN buffers, tagging each sample with its teacher DQN.
    experiences = []
    for (dqn, experience) in experience_by_dqn.items():
        experiences.extend([(dqn, ex) for ex in experience])
    for it in range(num_iter):
        bprop = self.bprop
        # sample a minibatch from the merged buffer, drawn with replacement.
        samples = prob.choice(experiences, self.minibatch_size, replace=True)
        is_valids = []
        targets = []
        states = []
        actions = np.zeros(self.minibatch_size, dtype=int)
        for idx, sample in enumerate(samples):
            (dqn, sample) = sample
            state, last_action, next_state, reward, meta = sample
            valid_actions = meta['last_valid_actions']
            num_actions = meta['num_actions']
            states.append(state)
            # 0/1 mask over all actions (1 = valid), so rows stay the same length.
            is_valid = [1. if action in set(valid_actions) else 0. for action in range(num_actions)]
            if self.loss == 'KL':
                target = dqn._get_softmax_action_distribution(state, temperature=temperature,
                                                              valid_actions=valid_actions)
            elif self.loss == 'l2' or self.loss == 'l1' or self.loss == 'l1-exp':
                target = dqn.av(state)
            elif self.loss == 'l1-action':
                target = [dqn.av(state)[last_action]]
                is_valid = [is_valid[last_action]]
            is_valids.append(is_valid)
            targets.append(target)
            actions[idx] = last_action
        states = np.array(states)
        targets = np.array(targets)
        is_valids = np.array(is_valids)
        score = self.bprop(states, actions, targets, is_valids)
        if print_lag and print_lag > 0 and it % print_lag == 0:
            print 'iter = ', it, 'score = ', score
def _learn(self, next_state, reward, next_valid_actions):
    '''
    need next_valid_actions to compute appropriate V = max_a Q(s', a).
    '''
    self._add_to_experience(self.last_state, self.last_action, next_state, reward, next_valid_actions)
    samples = prob.choice(self.experience, self.minibatch_size, replace=True)  # draw with replacement.
    for idx, sample in enumerate(samples):
        state, action, next_state, reward, nva = sample
        self.qfunc.table[state, action] *= (1 - self.alpha)
        if next_state is not None:
            self.qfunc.table[state, action] += self.alpha * (reward + self.gamma * np.max(self.qfunc.table[next_state, nva]))
        else:
            self.qfunc.table[state, action] += self.alpha * reward
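# Illustrative sketch (made-up numbers): the update in _learn above is the usual
# tabular Q-learning rule
#     Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_{a'} Q(s', a')).
import numpy as np

alpha, gamma = 0.1, 0.9
Q = np.zeros((2, 2))          # 2 states x 2 actions.
Q[1, :] = [0.0, 1.0]

s, a, r, s_next, nva = 0, 0, 0.5, 1, [0, 1]
Q[s, a] *= (1 - alpha)
Q[s, a] += alpha * (r + gamma * np.max(Q[s_next, nva]))
# Q[0, 0] is now 0.1 * (0.5 + 0.9 * 1.0) = 0.14.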
def _on_screen_update(self, _, *args, **kwargs):
    self.total_frames += 1
    is_end = self.is_end()
    if not is_end and (self.total_frames - 1) % self.frames_per_action > 0:
        if self.callback:
            # TODO: callback on skip steps. now callback is only used for videos.
            self.callback()
        return
    score = self.get_score()
    reward = score - self.curr_score
    self.cum_reward += reward
    self.curr_score = score
    if self.state_type == 'pixel':
        self.curr_screen_rgb = pygame.surfarray.array3d(pygame.display.get_surface())
        frame = self._get_frame()
        self.frames.append(frame)
    if len(self.frames) < self.num_frames:
        action = choice(self.valid_actions, 1)[0]
    else:
        if len(self.frames) > self.num_frames:
            self.frames = self.frames[-4:]
        curr_state = self._get_state()
        if self.callback:
            self.callback()
        if self.last_action is not None:
            self.learner.send_feedback(reward, curr_state, self.valid_actions, is_end)
        if is_end:
            return
        action = self.learner.get_action(curr_state, self.valid_actions)
        self.total_steps += 1
    self.last_action = action
    self._last_keys_pressed = self._keys_pressed
    self._keys_pressed = [self.valid_events[action]]
def step(self, actionid):
    assert(actionid >= 0 and actionid < self.num_actions)
    action = self.ACTIONS[actionid]
    if action == 'move eye to hand':
        self.state['eye_pos'] = self.state['hand_pos']
    elif action == 'move eye to marker':
        self.state['eye_pos'] = self.state['mark_pos']
    elif action == 'move eye north':
        if self.state['eye_pos'][0] > 0:
            self.state['eye_pos'][0] -= 1
    elif action == 'move eye south':
        if self.state['eye_pos'][0] < self.size - 1:
            self.state['eye_pos'][0] += 1
    elif action == 'move eye west':
        if self.state['eye_pos'][1] > 0:
            self.state['eye_pos'][1] -= 1
    elif action == 'move eye east':
        if self.state['eye_pos'][1] < self.size - 1:
            self.state['eye_pos'][1] += 1
    elif action == 'move eye to a random object':
        pos = prob.choice(self.object_pos, 1)[0]
        self.state['eye_pos'] = pos
    elif action == 'move hand to eye':
        self.state['hand_pos'] = self.state['eye_pos']
    elif action == 'move marker to eye':
        self.state['mark_pos'] = self.state['eye_pos']
    elif action == 'touch object' and self._can_touch_object():
        if self.state['eye_pos'] == self.state['red_button_pos']:
            self.state['music'] = False
        elif self.state['eye_pos'] == self.state['blue_button_pos']:
            self.state['music'] = True
        elif self.state['eye_pos'] == self.state['switch_pos']:
            self.state['light'] = not self.state['light']
        elif (self.state['eye_pos'] == self.state['ball_pos'] and
              (self.state['mark_pos'][0] == self.state['ball_pos'][0] or
               self.state['mark_pos'][1] == self.state['ball_pos'][1])):
            # kick the ball if ball and mark are on a straight line.
            self.state['ball_pos'] = self.state['mark_pos']
    return 0.
def _get_uct_action(self, state_vector, uct, param_c, valid_actions, debug=False):
    init_count = 1.  # initial count for all actions.
    action_values = {action: self.av(state_vector)[action] for action in valid_actions}
    uct_values = {action: uct.count_sa(state_vector, action) for action in valid_actions}
    uct_state_values = {action: uct.count_s(state_vector) for action in valid_actions}
    ucb = {action: action_values[action] +
           param_c * np.sqrt(np.log(len(valid_actions) * init_count + uct_state_values[action]) /
                             (init_count + uct_values[action]))
           for action in valid_actions}
    max_val = -float('inf')
    max_actions = []
    for (action, value) in ucb.items():
        if value > max_val:
            max_val = value
            max_actions = [action]
        elif value == max_val:
            max_actions.append(action)
    if debug:
        print 'action_values', action_values
        print 'uct_values', uct_values
        print 'uct_state_values', uct_state_values
        print 'ucb', ucb
    return prob.choice(max_actions, 1)[0]
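# Illustrative sketch (toy numbers): the UCT score used above is
#     Q(s, a) + c * sqrt(log(|A| * n0 + N(s)) / (n0 + N(s, a))),
# where N(s) and N(s, a) are visit counts and n0 is the initial count.
import numpy as np

param_c, init_count = 1.0, 1.
action_values = {0: 0.2, 1: 0.5}        # Q(s, a) estimates.
count_sa = {0: 3, 1: 10}                # N(s, a).
count_s = 13                            # N(s).

ucb = {a: action_values[a] +
       param_c * np.sqrt(np.log(len(action_values) * init_count + count_s) /
                         (init_count + count_sa[a]))
       for a in action_values}
# Action 0 gets the larger exploration bonus because it has been tried less often.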
def run(self, tasks, num_epochs=1):
    # set local variables.
    K = self.K
    all_settings = set()
    for task in tasks:
        all_settings.add(self.feat_func(task))
    all_settings = list(all_settings)
    for task in tasks:
        if task not in self.task_score:
            self.task_score[task] = self.eval_func(task)
    for ni in range(num_epochs):
        im_pred = {}
        im_sigma = {}
        im_ucb = {}
        if len(self.task_im) < 1:
            # select task based on prior.
            if not self.init_setting:
                chosen_task = prob.choice(tasks, 1)
            else:
                chosen_task = self.sample_func(self.init_setting)
        else:
            # select task based on GP.
            im = [(self.feat_func(task), im) for (task, im) in self.task_im.items()]
            # use Gaussian Process to estimate potential function.
            N = len(im)
            KXX = np.zeros((N, N))
            y = np.zeros(N)
            for (ti, (setting_i, im_i)) in enumerate(im):
                for (tj, (setting_j, im_j)) in enumerate(im):
                    KXX[ti, tj] = self.kernel_func(setting_i, setting_j)
            for (ti, (setting_i, im_i)) in enumerate(im):
                y[ti] = im_i
            M = len(all_settings)
            KXsX = np.zeros((M, N))
            KXsXs = np.zeros((M, M))
            for (ti, setting_i) in enumerate(all_settings):
                for (tj, (setting_j, im_j)) in enumerate(im):
                    KXsX[ti, tj] = self.kernel_func(setting_i, setting_j)
                KXsXs[ti, ti] = self.kernel_func(setting_i, setting_i)
            KXXinv = npla.inv(KXX + self.sigma_n ** 2 * np.eye(N))
            pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
            pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
            pred_sigma = np.sqrt(np.diag(pred_cov))
            for (ti, setting) in enumerate(all_settings):
                im_pred[setting] = pred_mean[ti]
                im_sigma[setting] = pred_sigma[ti]
                im_ucb[setting] = pred_mean[ti] + self.eta * pred_sigma[ti]
            new_settings = sorted(all_settings, key=lambda setting: im_ucb[setting], reverse=True)
            new_setting = new_settings[0]
            chosen_task = self.sample_func(new_setting)
        self.train_func(chosen_task)
        for task in tasks:
            score = self.eval_func(task)
            self.task_im[task] = score - self.task_score[task]
            self.task_score[task] = score
        # collect diagnostics.
        self.diagnostics['task_im'] = self.task_im
        self.diagnostics['task_score'] = self.task_score
        self.diagnostics['pred'] = im_pred
        self.diagnostics['sigma'] = im_sigma
        self.diagnostics['ucb'] = im_ucb
        self.diagnostics['task'] = chosen_task
        self.diagnostics['setting'] = self.feat_func(chosen_task)
def run(self, tasks, num_epochs=1):
    if len(self.active_tasks) == 0:
        # initial round.
        # choose a set of active tasks uniformly at random.
        self.active_tasks = prob.choice(tasks, size=self.K, replace=True)
    # set local variables.
    active_tasks = self.active_tasks
    passive_tasks = self.passive_tasks
    K = self.K
    K0 = self.K0
    K1 = self.K1
    for ni in range(num_epochs):
        # compute old score if necessary.
        for task in active_tasks:
            if task not in self.task_score:
                self.task_score[task] = self.eval_func(task)
        # learn on each task.
        for task in active_tasks:
            self.train_func(task)
        if len(passive_tasks) >= K1:
            selected_passive_tasks = prob.choice(self.passive_tasks, size=K1, replace=False)
            for task in selected_passive_tasks:
                self.train_func(task)
        # evaluate improvement.
        im = {}
        for task in active_tasks:
            new_score = self.eval_func(task)
            im[task] = new_score - self.task_score[task]
            self.task_score[task] = new_score
        # create candidate set.
        candidate_set = set()
        for task in active_tasks:
            candidate_set = candidate_set.union(set(self.expand_func(task)))
        candidate_set = candidate_set.union(set(prob.choice(tasks, size=K0, replace=False)))
        candidate_set = list(candidate_set)
        new_tasks = candidate_set
        # use Gaussian Process to estimate potential function.
        N = len(im)
        KXX = np.zeros((N, N))
        y = np.zeros(N)
        for (ti, (task_i, im_i)) in enumerate(im.items()):
            for (tj, (task_j, im_j)) in enumerate(im.items()):
                KXX[ti, tj] = self.kernel_func(task_i, task_j)
        for (ti, (task_i, im_i)) in enumerate(im.items()):
            y[ti] = im_i
        M = len(new_tasks)
        KXsX = np.zeros((M, N))
        KXsXs = np.zeros((M, M))
        for (ti, task_i) in enumerate(new_tasks):
            for (tj, (task_j, im_j)) in enumerate(im.items()):
                KXsX[ti, tj] = self.kernel_func(task_i, task_j)
            KXsXs[ti, ti] = self.kernel_func(task_i, task_i)
        KXXinv = npla.inv(KXX + self.sigma_n ** 2 * np.eye(N))
        pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
        pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
        pred_sigma = np.sqrt(np.diag(pred_cov))
        im_pred = {}
        im_sigma = {}
        im_ucb = {}
        for (ti, task) in enumerate(new_tasks):
            im_pred[task] = pred_mean[ti]
            im_sigma[task] = pred_sigma[ti]
            im_ucb[task] = pred_mean[ti] + self.eta * pred_sigma[ti]
        new_tasks = sorted(new_tasks, key=lambda task: im_ucb[task], reverse=True)
        new_tasks_selected = new_tasks[:K]
        self.passive_tasks = self.passive_tasks.union(self.active_tasks).difference(new_tasks_selected)
        self.active_tasks = new_tasks_selected
        # collect diagnostics.
        self.diagnostics['im'] = im
        self.diagnostics['pred'] = im_pred
        self.diagnostics['sigma'] = im_sigma
        self.diagnostics['ucb'] = im_ucb
        self.diagnostics['score'] = self.task_score
        self.diagnostics['new-tasks'] = new_tasks
        self.diagnostics['new-tasks-selected'] = new_tasks_selected
        self.diagnostics['active-tasks'] = active_tasks
def get_action(self, state, valid_actions, **kwargs):
    action = prob.choice(valid_actions, 1)[0]
    return action
def generate_experience_mt(policy, tasks, budget_experience, budget_per_episode=None, state_attr='curr_state'):
    experiences = []
    while len(experiences) < budget_experience:
        task = prob.choice(tasks, 1)[0]
        experiences.extend(generate_experience(policy, task, budget_experience - len(experiences),
                                               budget_per_episode, budget_episodes=1, state_attr=state_attr))
    return experiences
def run(self, tasks, num_epochs=1):
    # set local variables.
    K = self.K
    K0 = self.K0
    # initial round.
    if len(self.active_tasks) == 0:
        # choose a set of active tasks uniformly at random.
        if self.init_tasks:
            self.active_tasks = set(prob.choice(self.init_tasks, size=K0, replace=False))
        else:
            self.active_tasks = set(prob.choice(tasks, size=K0, replace=False))
    for ni in range(num_epochs):
        active_tasks = self.active_tasks
        curr_tasks = active_tasks
        # compute old score if necessary.
        for task in curr_tasks:
            if task not in self.task_score:
                self.task_score[task] = self.eval_func(task)
        # learn on each task.
        for task in active_tasks:
            self.train_func(task)
        # evaluate improvement.
        im = {}
        for task in curr_tasks:
            new_score = self.eval_func(task)
            improvement = new_score - self.task_score[task]
            self.task_score[task] = new_score
            if task not in self.im_mem:
                self.im_mem[task] = []
            self.im_mem[task].append((self.time, improvement))
            im[task] = np.mean([i for (t, i) in self.im_mem[task] if self.time - t <= 3])
        # create candidate set.
        new_tasks = tasks
        if len(new_tasks) == 0:
            print 'WARNING: new tasks is empty in GP'
        # use Gaussian Process to estimate potential function.
        N = len(im)
        KXX = np.zeros((N, N))
        y = np.zeros(N)
        for (ti, (task_i, im_i)) in enumerate(im.items()):
            for (tj, (task_j, im_j)) in enumerate(im.items()):
                KXX[ti, tj] = self.kernel_func(task_i, task_j)
        for (ti, (task_i, im_i)) in enumerate(im.items()):
            y[ti] = im_i
        M = len(new_tasks)
        KXsX = np.zeros((M, N))
        KXsXs = np.zeros((M, M))
        for (ti, task_i) in enumerate(new_tasks):
            for (tj, (task_j, im_j)) in enumerate(im.items()):
                KXsX[ti, tj] = self.kernel_func(task_i, task_j)
            KXsXs[ti, ti] = self.kernel_func(task_i, task_i)
        KXXinv = npla.inv(KXX + self.sigma_n ** 2 * np.eye(N))
        pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
        pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
        pred_sigma = np.sqrt(np.diag(pred_cov))
        im_pred = {}
        im_sigma = {}
        im_ucb = {}
        for (ti, task) in enumerate(new_tasks):
            im_pred[task] = pred_mean[ti]
            im_sigma[task] = pred_sigma[ti]
            im_ucb[task] = pred_mean[ti] + self.eta * pred_sigma[ti]
        new_tasks = sorted(new_tasks, key=lambda task: im_ucb[task], reverse=True)
        new_tasks_selected = new_tasks[:K]
        self.active_tasks = self.active_tasks.union(set(new_tasks_selected))
        self.time += 1
        # collect diagnostics.
        self.diagnostics['im'] = im
        self.diagnostics['pred'] = im_pred
        self.diagnostics['sigma'] = im_sigma
        self.diagnostics['ucb'] = im_ucb
        self.diagnostics['score'] = self.task_score
        self.diagnostics['new-tasks'] = new_tasks
        self.diagnostics['new-tasks-selected'] = new_tasks_selected
        self.diagnostics['active-tasks'] = self.active_tasks
def _update_net(self):
    '''
    sample from the memory dataset and perform gradient descent on
    (target - Q(s, a))^2
    '''
    # don't update the network until sufficient experience has been
    # accumulated.
    # removing this might cause correlation for early samples. suggested to be used in curriculums.
    if self.total_exp < self.skip_frame:
        return
    if self.total_exp % self.update_freq:
        return
    # if len(self.experience) < self.memory_size:
    #     return
    for nn_bi in range(self.nn_num_batch):
        states = [None] * self.minibatch_size
        next_states = [None] * self.minibatch_size
        actions = np.zeros(self.minibatch_size, dtype=int)
        rewards = np.zeros(self.minibatch_size)
        nvas = []
        # sample and process minibatch.
        # samples = random.sample(self.experience, self.minibatch_size)  # draw without replacement.
        samples = prob.choice(self.experience, self.minibatch_size, replace=True)  # draw with replacement.
        terminals = []
        for idx, sample in enumerate(samples):
            state, action, next_state, reward, meta = sample
            nva = meta['next_valid_actions']
            states[idx] = state
            actions[idx] = action
            rewards[idx] = reward
            nvas.append(nva)
            if next_state is not None:
                next_states[idx] = next_state
            else:
                next_states[idx] = state
                terminals.append(idx)
        # convert states into tensor.
        states = np.array(states).astype(floatX)
        next_states = np.array(next_states).astype(floatX)
        # compute target reward + \gamma max_{a'} Q(ns, a').
        # Ensure target = reward when NEXT_STATE is terminal.
        if self.target_freq > 0:
            next_qvals = self.dqn_frozen.fprop(next_states)
        else:
            next_qvals = self.dqn.fprop(next_states)
        use_DDQN = False
        next_vs = np.zeros(self.minibatch_size).astype(floatX)
        if use_DDQN:
            # double DQN.
            next_qvals_unfrozen = self.dqn.fprop(next_states)
            for idx in range(self.minibatch_size):
                if idx not in terminals:
                    next_action_index = np.argmax(next_qvals_unfrozen[idx, nvas[idx]])
                    next_vs[idx] = next_qvals[idx, nvas[idx][next_action_index]]
        else:
            for idx in range(self.minibatch_size):
                if idx not in terminals:
                    next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])
        targets = rewards + self.gamma * next_vs
        # if (targets > 100.).any():
        #     print 'error, target > 1', targets
        #     print 'rewards', rewards
        #     print 'next_vs', next_vs
        # using regularization.
        reg_vs = []
        reg = self.regularizer.get('dqn-q')
        if reg:
            dqn = reg['dqn']
            # dqn_avs = dqn.fprop(states)
            dqn_avs = self.dqn_frozen.fprop(states)
            # dqn_avs = next_qvals
            # for idx in range(self.minibatch_size):
            #     if idx not in terminals:
            #         dqn_avs[idx, :] = 0.
            reg_vs.append(dqn_avs)
        ## diagnostics.
        # print 'targets', targets
        # print 'next_qvals', next_qvals
        # print 'pure prop', self.dqn.fprop(states)
        # print 'prop', self.dqn.fprop(states)[range(states.shape[0]), actions]
        # print 'actions', actions
        nn_error = []
        for nn_it in range(self.nn_num_iter):
            if debug_flag and self.target_freq and self.total_exp % self.target_freq == 0:
                print 'value before\n', self.dqn.fprop(states)[range(self.minibatch_size), actions]
            error = self.bprop(states, actions, targets.flatten(), *reg_vs)
            if debug_flag and self.target_freq and self.total_exp % self.target_freq == 0:
                print 'nn_it', nn_it, 'error', error
                print 'value after\n', self.dqn.fprop(states)[range(self.minibatch_size), actions]
                print 'targets\n', targets
                # print 'dqn vs\n', self.dqn.fprop(states)
                # print 'dqn avs\n', dqn_avs
                print 'next_qvals\n', next_qvals
                print 'rewards', rewards
                print 'total_exp', self.total_exp
            nn_error.append(float(error))
        self.diagnostics['nn-error'].append(nn_error)
def get_action(self, state, valid_actions=None):
    if not valid_actions:
        valid_actions = range(self.num_actions)
    action = prob.choice(valid_actions, 1)[0]
    return action
from pyrl.tasks.pyale import PythonGame
from pyrl.tasks.pyale.pong import PongGame
from pyrl.utils import Timer
from pyrl.visualize.visualize import *
from pyrl.prob import choice

game = PongGame()

with Timer('valid actions'):
    for it in range(100):
        print 'valid_actions', game.valid_actions

vr = RawVideoRecorder('video.m4v', (640, 480))
for it in range(100):
    action = choice(range(game.num_actions), 1)[0]
    reward = game.step(action)
    print 'state', game.curr_state
    print 'is_end', game.is_end()
    # vr.write_frame(game.visualize_raw())
    print 'action', action, 'reward', reward
vr.stop()
def callback(task):
    imgbuf = StringIO()
    task.visualize(fig=1, fname="__cache__.jpg", format="jpg")
    with open("__cache__.jpg", "rb") as imgbuf:
        data = imgbuf.read()
    vr.write_frame(data)

game = AtariGame("data/roms/pong.bin", live=True, skip_frame=65)
# plt.imshow(game._curr_frame, cmap='Greys_r', interpolation='none')
# plt.show()
vr = VideoRecorder("video.m4v")
buf = StringIO()
count = 0
while not game.is_end():
    count += 1
    a = choice(game.valid_actions, 1)[0]
    print "action", a
    print "game", game.valid_actions
    a = 12
    # game.visualize(fig=1, fname=buf, format='jpg')
    print game.curr_state.shape
    game.step(a)
    callback(game)
vr.stop()
def run(self, task, num_episodes=100, num_steps=float('inf'), tol=1e-4, debug=False):
    '''
    update qval every *num_epoch*
    for every *num_epoch*, run *num_episodes* of MCTS.
    '''
    cum_rewards = []
    total_steps = 0.
    for ei in range(num_episodes):
        count_steps = 0.
        cum_reward = 0.
        factor = 1.
        history = []
        phase_expansion = False
        task.reset()
        while True:
            if total_steps > num_steps or count_steps >= np.log(tol) / np.log(self.gamma) or task.is_end():
                self.backprop(history)
                break
            curr_state = task.curr_state
            meta = {}
            unvisited_actions = [action for action in task.valid_actions
                                 if self.qval.get(curr_state, action) is None]
            if not phase_expansion and unvisited_actions:
                # can we switch back to qval if unvisited is empty?
                phase_expansion = True
                action = prob.choice(unvisited_actions, 1)[0]
                meta['phase'] = 'selection'
            elif phase_expansion:
                # expand.
                meta['phase'] = 'expansion'
                if self.default_policy == 'random':
                    action = self.random_policy.get_action(curr_state, valid_actions=task.valid_actions)
                elif self.default_policy == 'rb-eps':
                    action = self.rb.get_action(curr_state, valid_actions=task.valid_actions,
                                                method='eps-greedy', epsilon=0.05)
            else:
                # select.
                meta['phase'] = 'selection'
                action = self.qval.get_action(curr_state, valid_actions=task.valid_actions, method='uct',
                                              uct=self.uct, param_c=self.param_c, debug=False)
                # action = self.qval.get_action(curr_state, valid_actions=task.valid_actions,
                #                               method='eps-greedy', epsilon=0.05)
            meta['valid_actions'] = task.valid_actions
            reward = task.step(action)
            cum_reward = cum_reward + factor * reward
            factor *= self.gamma
            history.append((curr_state, action, reward, meta))
            count_steps += 1
            total_steps += 1
            self.total_exp += 1
        cum_rewards.append(cum_reward)
        if total_steps > num_steps:
            break
        task.reset()
        print 'ei', ei
        print 'cum', cum_rewards
    return np.mean(cum_rewards)
def update_net(self, num_iter=1):
    '''
    sample from the memory dataset and perform gradient descent on
    (target - Q(s, a))^2
    '''
    # if self.total_exp_by_task[task] < self.memory_size:
    #     return
    # merge experience buffer.
    experience = []
    for task in self.ex_task:
        experience.extend(self.ex_task[task])
    errors = []
    for it in range(num_iter):
        # don't update the network until sufficient experience has been accumulated.
        states = [None] * self.minibatch_size
        next_states = [None] * self.minibatch_size
        actions = np.zeros(self.minibatch_size, dtype=int)
        rewards = np.zeros(self.minibatch_size)
        nvas = []
        # sample and process minibatch.
        # samples = random.sample(self.experience, self.minibatch_size)  # draw without replacement.
        samples = prob.choice(experience, self.minibatch_size, replace=True)  # draw with replacement.
        terminals = []
        for idx, sample in enumerate(samples):
            state, action, next_state, reward, nva = sample
            states[idx] = state
            actions[idx] = action
            rewards[idx] = reward
            nvas.append(nva)
            if next_state is not None:
                next_states[idx] = next_state
            else:
                next_states[idx] = state
                terminals.append(idx)
        # convert states into tensor.
        states = np.array(states)
        next_states = np.array(next_states)
        # compute target reward + \gamma max_{a'} Q(ns, a').
        # Ensure target = reward when NEXT_STATE is terminal.
        next_qvals = self.dqn.fprop(next_states)
        next_vs = np.zeros(self.minibatch_size)
        for idx in range(self.minibatch_size):
            if idx not in terminals:
                next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])
        targets = rewards + self.gamma * next_vs
        ## diagnostics.
        # print 'targets', targets
        # print 'next_qvals', next_qvals
        # print 'pure prop', self.dqn.fprop(states)
        # print 'prop', self.dqn.fprop(states)[range(states.shape[0]), actions]
        # print 'actions', actions
        # for it in range(10):
        error = self.bprop(states, actions, targets.flatten())
        errors.append(error)
        # print 'it', it, 'error', error
    return np.mean(errors)
def run(self, num_epochs=1, num_episodes=1):
    cov_func = lambda task1, task2, t1, t2: self.gpt_v * np.exp(
        -(self.dist(task1, task2) ** 2 * self.gpt_r + self.gpt_eta * (t1 - t2) ** 2))
    for ei in range(num_epochs):
        # task selection.
        # complexity max(#task * history, history ** 2.3)
        if len(self.examples) == 0:
            # no prior experience, choose randomly.
            task = prob.choice(self.tasks, 1)[0]
        else:
            # GP-t.
            mu = np.zeros(self.num_tasks)
            sigma = np.zeros(self.num_tasks)
            ucb = np.zeros(self.num_tasks)
            # Kinv = npla.inv(self.K + self.gpt_sigma ** 2)
            # Kinv_y = np.dot(Kinv, self.y)
            Kinv_y = npla.solve(self.K + np.eye(self.t) * self.gpt_sigma ** 2, self.y)
            for ti, task in enumerate(self.tasks):
                vec = np.zeros(self.t)
                for ei in range(self.t):
                    (t_ei, task_ei, _) = self.examples[ei]
                    vec[ei] = cov_func(task, task_ei, self.t, t_ei)
                mu[ti] = np.dot(vec, Kinv_y)
                Kinv_vec = npla.solve(self.K + np.eye(self.t) * self.gpt_sigma ** 2, vec)
                sigma[ti] = self.gpt_v + self.gpt_sigma ** 2 - np.dot(vec, Kinv_vec)
                ucb[ti] = mu[ti] + self.gpt_kappa * sigma[ti]
            best_ti = np.argmax(ucb)
            task = self.tasks[best_ti]
            # store information for diagnosis.
            self.mu = mu
            self.sigma = sigma
            self.ucb = ucb
            # import pdb; pdb.set_trace()
        # run training.
        self._run_task(task, num_episodes=num_episodes)
        # evaluate performance.
        self.last_task_performance = np.zeros(self.num_tasks)
        for ti in range(self.num_tasks):
            self.last_task_performance[ti] = expected_reward_tabular_normalized(self.dqn, self.tasks[ti], tol=1e-4)
        performance = np.mean(self.last_task_performance)
        progress = performance - self.last_performance
        # update statistics.
        self.examples.append((self.t, task, progress))
        self.t += 1
        t = self.t
        new_K = np.zeros((t, t))
        new_y = np.zeros(t)
        if t > 1:
            new_K[:t - 1, :t - 1] = self.K
            new_y[:t - 1] = self.y
        new_K[t - 1, t - 1] = self.gpt_v
        new_y[t - 1] = progress
        for ei in range(t - 1):
            (t_ei, task_ei, _) = self.examples[ei]
            new_K[t - 1, ei] = cov_func(task_ei, task, t_ei, t - 1)
            new_K[ei, t - 1] = new_K[t - 1, ei]  # symmetric.
        self.K = new_K
        self.y = new_y
        self.last_performance = performance
        self.last_progress = progress
        self.last_task = task
        self.last_task_ti = self.tasks.index(task)
def get_action(self, curr_state, valid_actions):
    action = choice(valid_actions, 1)[0]
    return action
def _get_relaxation_action(self, state_vector, dqn, uct, param_c, valid_actions, strategy='wa-state', debug=False):
    init_count = 1.  # initial count for all actions.
    action_values = {action: self.av(state_vector)[action] for action in valid_actions}
    uct_values = {action: uct.count_sa(state_vector, action) for action in valid_actions}
    uct_state_values = {action: uct.count_s(state_vector) for action in valid_actions}
    # ucb = upper confidence bound.
    ucb = {action: action_values[action] +
           param_c * np.sqrt(np.log(len(valid_actions) * init_count + uct_state_values[action]) /
                             (init_count + uct_values[action]))
           for action in valid_actions}
    # rb = relaxation bound.
    rb = {action: dqn.av(state_vector)[action] for action in valid_actions}
    print 'strategy', strategy
    # just use rb.
    if strategy == 'rb':
        finalb = rb
    # just use av.
    if strategy == 'av':
        finalb = action_values
    # min of upper bounds.
    if strategy == 'ucb-rb':
        finalb = {action: min(ucb[action], rb[action]) for action in valid_actions}
    # thres = 10
    # finalb = {action: ucb[action] if uct.count_sa(state_vector, action) > thres else rb[action]
    #           for action in valid_actions}
    # thres = 10
    # finalb = {action: ucb[action] if uct.count_s(state_vector) > thres else rb[action]
    #           for action in valid_actions}
    # weighted average.
    if strategy == 'wa-state':
        ratio = 1. / (1. + uct.count_s(state_vector))
        finalb = {action: ucb[action] * (1 - ratio) + rb[action] * ratio for action in valid_actions}
    # weighted average by state action.
    if strategy == 'wa':
        finalb = {}
        for action in valid_actions:
            ratio = 1. / (1. + uct.count_sa(state_vector, action))
            finalb[action] = action_values[action] * (1 - ratio) + rb[action] * ratio
    # duality-gap.
    if strategy == 'duality-gap':
        gap2 = sum([(rb[action] - action_values[action]) ** 2 for action in valid_actions]) / len(valid_actions)
        ratio = max(0, 1 - np.std(rb.values()) ** 2 / gap2 / uct.count_s(state_vector))
        finalb = {action: ucb[action] * (1 - ratio) + rb[action] * ratio for action in valid_actions}
        if debug:
            print 'std of relaxation', np.std(rb.values())
            print 'mean gap', np.sqrt(gap2)
            print 'ratio', ratio
    # finalb = ucb
    # choose action.
    max_val = -float('inf')
    max_actions = []
    for (action, value) in finalb.items():
        if value > max_val:
            max_val = value
            max_actions = [action]
        elif value == max_val:
            max_actions.append(action)
    if debug:
        print 'action_values', action_values
        print 'uct_values', uct_values
        print 'uct_state_values', uct_state_values
        print 'ucb', ucb
        print 'rb', rb
        print 'finalb', finalb
    return prob.choice(max_actions, 1)[0]