def do_user_clustering(self):
    """Cluster users into 2 groups from their per-cluster rating profiles.

    Greedy neighborhood covering: repeatedly pick the user whose eps-ball
    (in rho_users space) covers the most not-yet-covered users, and use
    that user's profile as the column of P_kl_user for that group.
    Side effects: sets self.eps_users, self.P_kl_user and self.argmax_k.
    """
    self.compute_user_averages()
    self.select_U_star()
    self.eps_users = self.compute_eps_users()
    # find neighborhoods: Q[u] = users within eps_users of user u's profile
    Q = [0 for u in range(self.M)]
    for u in range(self.M):
        Q[u] = set()
        for v in range(self.M):
            if np.linalg.norm(self.rho_users[u, :] -
                              self.rho_users[v, :]) <= self.eps_users:
                Q[u].add(v)
    Qprev = set()
    # greedily pick 2 group representatives by largest uncovered neighborhood
    for l in range(2):
        cardinalities = [0 for y in range(self.M)]
        for u in range(self.M):
            cardinalities[u] = len(Q[u] - Qprev)
        u_l = rand_argmax(np.array(cardinalities))
        self.P_kl_user[:, l] = np.transpose(self.rho_users[u_l, :])
        Qprev = Qprev.union(Q[u_l])
    # per group, remember which item-cluster has the largest estimated rating
    for l in range(2):
        self.argmax_k[l] = rand_argmax(self.P_kl_user[:, l])
def choose_item(self, state):
    """Select the next item for the current user by the KL-UCB index.

    Args:
        state: tuple (current user id, Xi matrix of already-shown items
               with shape (N, M), previous reward).

    Returns:
        The index of the selected item. Also updates the empirical
        statistics, the selection counter, item_prev and the time step.
    """
    (self.u_current, self.Xi_current, self.prev_reward) = state
    Xi_u = self.Xi_current[:, self.u_current]
    assert len(Xi_u) == self.N
    # update the reward sum
    self.update_reward_sum()
    # update empirical average
    self.update_emprical_average()
    # update KLUCB index
    self.update_KLUCB_index()
    # BUG FIX: work on a copy — the original aliased self.KLUCB_indexes,
    # so zeroing entries below mutated the stored indexes as well.
    KLUCB_index_u = self.KLUCB_indexes.copy()
    # items that are not recommendable to the user are removed
    for i in range(self.N):
        if Xi_u[i] == 1:
            KLUCB_index_u[i] = 0
    # choose item based on modified KLUCB indexes
    self.item_selected = rand_argmax(KLUCB_index_u)
    # update the counter
    self.item_cnt[self.item_selected] = self.item_cnt[self.item_selected] + 1
    # record previously used item
    self.item_prev = self.item_selected
    # increment time
    self.t = self.t + 1
    return self.item_selected
def exploitation(self):
    """Exploitation step: recommend from the UCB-selected item cluster.

    If compute_UCB_ind identifies a concrete cluster (1 or 2), recommend
    from it; otherwise alternate clusters per user (round-robin, tracked
    in self.k_prev). Updates item_selected and the selection counter.
    """
    k = self.compute_UCB_ind()
    if k == 1 or k == 2:
        self._select_from_cluster(k)
    else:
        # do recommendations in a round-robin manner, alternating the
        # cluster used for this user between calls
        if self.k_prev[self.u_current] == 1:
            # recommend from cluster 2
            self.k_prev[self.u_current] = 2
            self._select_from_cluster(2)
        else:
            # recommend from cluster 1
            self.k_prev[self.u_current] = 1
            self._select_from_cluster(1)
    # update the counter
    self.item_cnt[self.item_selected] = self.item_cnt[self.item_selected] + 1
    assert self.Xi_current[self.item_selected, self.u_current] == 0

def _select_from_cluster(self, k):
    """Pick a not-yet-shown item from cluster k; fall back to a random
    unseen item when cluster k has nothing recommendable."""
    recommendable_from_I_k = np.multiply(
        np.array(self.Xi_u) == 0, np.array(self.I) == k)
    if sum(recommendable_from_I_k) > 0:
        self.item_selected = rand_argmax(recommendable_from_I_k)
    else:
        self.item_selected = rand_argmin(self.Xi_u)
        #print('random sampling (exploi)')
def do_clustering(self):
    """Cluster active items (I_0 == 1) by their empirical average rewards.

    Greedy covering: K cluster centers are the items whose epsilon-ball of
    averages covers the most uncovered active items. Fills
    self.cluster_average with the centers' averages in decreasing order.
    """
    averages = self.emprical_average()
    # push inactive items far away so they never match an active item
    # NOTE(review): this ADDS LARGE_CONST to inactive items; they are
    # also excluded below by the I_0 checks — confirm both are intended.
    averages = averages + (self.I_0 == 0) * self.LARGE_CONST
    # find neighborhoods: Q[i] = active items within epsilon_alg of item i
    Q = [0 for i in range(self.N)]
    for i in range(self.N):
        Q[i] = set()
        if self.I_0[i] == 1:
            for j in range(self.N):
                if self.I_0[j] == 1:
                    if abs(averages[i] - averages[j]) <= self.epsilon_alg:
                        Q[i].add(j)
    M = set()
    Qprev = set()
    # find centers of clusters (largest uncovered neighborhood first)
    for k in range(self.K):
        cardinalities = [0 for i in range(self.N)]
        for i in range(self.N):
            if self.I_0[i] == 1:
                cardinalities[i] = len(Q[i] - Qprev)
        i = rand_argmax(np.array(cardinalities))
        M.add(i)
        Qprev = Qprev.union(Q[i])
    # initialize with negative value
    average_cluster_tmp = [-self.LARGE_CONST for i in range(self.N)]
    # put empirical averages of the elements of M
    for m in M:
        average_cluster_tmp[m] = averages[m]
    average_cluster_tmp = np.array(average_cluster_tmp)
    # reorder the cluster averages (in decreasing order in k) by
    # repeatedly extracting the largest remaining center
    for k in range(self.K):
        i = rand_argmax(average_cluster_tmp)
        assert i in M
        self.cluster_average[k] = average_cluster_tmp[i]
        M = M - {i}
        average_cluster_tmp[i] = -self.LARGE_CONST
def exploitation(self):
    """Exploitation step: recommend the best active item in V.

    Recommends the item in V with the highest empirical average that the
    current user has not seen; if V has nothing recommendable, draws a
    fresh item from outside V_0 and adds it to V and V_0.
    """
    # update I_1
    self.update_V()
    # compute recommendable items from V to current user
    activeitems = np.multiply(self.Xi_u == 0, np.array(self.V) == 1)
    if sum(activeitems) > 0:
        # compute empirical averages
        averages = self.emprical_average()
        # keep averages only for the active items (others are forced to -1)
        for i in range(self.N):
            if activeitems[i] == 0:
                averages[i] = -1
        # choose the best (empirical average) item in V
        self.item_selected = rand_argmax(averages)
        assert self.Xi_current[self.item_selected, self.u_current] == 0
    else:
        # random sampling from V_0^c when there are no items from V that
        # can be recommended to the current user.
        # Recompute activeitems: recommendable and in V_0^c
        activeitems = np.multiply(self.Xi_u == 0, np.array(self.V_0) == 0)
        if sum(activeitems) == 0:
            self.item_selected = rand_argmin(self.Xi_u)
        else:
            # select an item uniformly at random from activeitems
            self.item_selected = rand_argmax(activeitems)
            # add the item to V and V_0
            self.V[self.item_selected] = 1
            self.V_0[self.item_selected] = 1
        # debug hook: selecting an already-shown item is a logic error
        if self.Xi_current[self.item_selected, self.u_current] == 1:
            from IPython.core.debugger import Pdb
            Pdb().set_trace()
    # update the counter
    self.item_cnt[self.item_selected] = self.item_cnt[self.item_selected] + 1
def select_U_star(self):
    """Greedily move the s_u_star most-observed users of U_0 into U_star.

    On each round every still-available user is scored by the number of
    items already shown to them (unavailable users score -1); the best
    scorer is added to U_star and removed from the candidate pool.
    """
    available = self.U_0.copy()
    for _ in range(self.s_u_star):
        scores = np.zeros(self.M)
        for u in range(self.M):
            observations = np.sum(np.array(self.Xi_current[:, u]))
            # available users keep their observation count; others get -1
            scores[u] = observations * available[u] - 1 * np.array(
                available[u] == 0)
        chosen = rand_argmax(scores)
        self.U_star[chosen] = 1
        available[chosen] = 0
def additional_sampling(self):
    """Sample the most-observed active item again, retiring saturated ones.

    An item whose count reached M (shown to every user) or whose count
    exceeds S_0 with an average far from the best cluster is moved into
    I_0 (retired from additional sampling). Sets item_selected and
    reward_rec_flag; increments the selection counter.
    """
    # minimum sample count needed to separate gaps of size Delta_0
    S_0 = np.ceil(1 / self.Delta_0**2)
    item_cnt_tmp = self.compute_cnt_for_active_items()
    # if item is changed, put the previous item into I_0
    if self.item_prev != self.item_selected:
        self.I_0[self.item_prev] = 1
    imax = rand_argmax(item_cnt_tmp)
    # debug hook: the chosen item must still be recommendable
    if self.Xi_current[imax, self.u_current] != 0:
        from IPython.core.debugger import Pdb
        Pdb().set_trace()
    assert self.Xi_current[imax, self.u_current] == 0
    # item shown to all M users: retire it and pick the next candidate
    while (item_cnt_tmp[imax] == self.M):
        print('M reached. change item')
        self.I_0[imax] = 1
        # mask retired items with a large negative offset
        item_cnt_tmp = self.item_cnt - 2 * self.M * self.I_0
        imax = rand_argmax(item_cnt_tmp)
    averages = self.emprical_average()
    # enough samples and clearly away from the best cluster: retire and
    # switch to the next most-observed active item
    if item_cnt_tmp[imax] >= S_0 and abs(
            averages[imax] - max(self.cluster_average)) >= self.Delta_0:
        self.I_0[imax] = 1
        item_cnt_tmp = self.compute_cnt_for_active_items()
        imax = rand_argmax(item_cnt_tmp)
        assert self.Xi_current[imax, self.u_current] == 0
    self.item_selected = imax
    # record the number of observations
    self.item_cnt[imax] = self.item_cnt[imax] + 1
    # indicate to record the reward
    self.reward_rec_flag = 1
def exploitation(self):
    """Exploitation step: recommend the item with the best estimated
    affinity (hatA) for the current user, falling back to a random
    unseen item when nothing recommendable scores positively."""
    # compute user's tendency; already-shown items are masked out.
    # NOTE(review): due to precedence, the second term compares the
    # PRODUCT LARGE_CONSTANT*maxhatA*Xi_u with 1 instead of multiplying
    # by (Xi_u == 1) — possibly a missing parenthesis; the fallback
    # below compensates when a shown item is selected. Confirm intent.
    A_u = np.multiply(
        np.array(self.Xi_u) == 0,
        np.array(self.hatA[:, self.u_current])) - np.array(
            self.LARGE_CONSTANT * self.maxhatA * np.array(self.Xi_u) == 1)
    self.item_selected = rand_argmax(A_u)
    # if we cannot select a good item, force a random sampling.
    if self.Xi_current[self.item_selected, self.u_current] == 1:
        self.item_selected = rand_argmin(self.Xi_u)
    # update the counter
    self.item_cnt[
        self.item_selected] = self.item_cnt[self.item_selected] + 1
    assert self.Xi_current[self.item_selected, self.u_current] == 0
def choose_item(self, state):
    """Select the next item for the current user by the KL-UCB index,
    restricted to the pre-sampled item set I_0.

    Args:
        state: tuple (current user id, Xi matrix of already-shown items
               with shape (N, M), previous reward).

    Returns:
        The index of the selected item. Also updates the empirical
        statistics, the selection counter, item_prev and the time step.
    """
    (self.u_current, self.Xi_current, self.prev_reward) = state
    Xi_u = self.Xi_current[:, self.u_current]
    assert len(Xi_u) == self.N
    if self.t == 1:
        # select T/m log T items
        self.random_I_0_selection()
    # update the reward sum
    self.update_reward_sum()
    # update empirical average
    self.update_emprical_average()
    # update KLUCB index
    self.update_KLUCB_index()
    # BUG FIX: work on a copy — the original aliased self.KLUCB_indexes,
    # so zeroing entries below mutated the stored indexes as well.
    KLUCB_index_u = self.KLUCB_indexes.copy()
    # items that are not recommendable to the user are removed
    for i in range(self.N):
        if Xi_u[i] == 1:
            KLUCB_index_u[i] = 0
    # choose item based on modified KLUCB indexes
    if sum(KLUCB_index_u) > 0:
        # select item in I_0 with largest KLUCB index
        self.item_selected = rand_argmax(KLUCB_index_u)
    else:
        # random item selection among unseen items
        self.item_selected = rand_argmin(Xi_u)
    # update the counter
    self.item_cnt[self.item_selected] = self.item_cnt[self.item_selected] + 1
    # record previously used item
    self.item_prev = self.item_selected
    self.t = self.t + 1
    return self.item_selected
def explorations(self):
    """Exploration step: recommend an item from S the current user has
    not seen yet; otherwise fall back to any random unseen item."""
    # candidates[i] == 1 iff item i is in S and not yet shown to the user
    candidates = np.multiply(np.array(self.Xi_u == 0),
                             np.array(np.array(self.S) == 1))
    if sum(candidates) > 0:
        # pick uniformly among the recommendable items of S
        self.item_selected = rand_argmax(candidates)
        assert self.Xi_current[self.item_selected, self.u_current] == 0
    else:
        # nothing left in S for this user: random sampling of a new item
        self.item_selected = rand_argmin(self.Xi_u)
        assert self.Xi_current[self.item_selected, self.u_current] == 0
        print('random sampling')
    # update the counter
    self.item_cnt[self.item_selected] += 1
def Tabular_q(env, episodes, num_act, episode_length=np.inf, epsilon=0.05,
              alpha=lambda v, t: 0.1, gamma=0.99, eval_interval=np.inf,
              Qs=None, init=0, soft_end=False, Q_trafo=lambda x: x):
    """Tabular Q-learning with epsilon-greedy exploration.

    Args:
        env: environment with reset()/step(act) -> (obs, rew, done, info);
             if env.dynamic is True, step also receives the Q-table.
        episodes: number of training episodes.
        num_act: number of actions.
        episode_length: cap on steps per episode.
        epsilon: exploration probability.
        alpha: map (visit count, elapsed time) -> learning rate.
        gamma: discount factor.
        eval_interval: every eval_interval episodes, run and print the
            greedy policy's return.
        Qs: optional initial Q-table dict (deep-copied); None starts fresh.
        init: constant initial Q-value when Qs is None.
        soft_end: if True, transitions into terminal states still bootstrap
            on the next state's Q-value; if False, terminal updates use the
            reward only.
        Q_trafo: scalarization applied to Q-vectors before action selection
            (for the multi-objective case).

    Returns:
        The learned Q-table as a dict mapping observation -> list of values.
    """
    vs = {}  # visit counts per (obs, act), same layout as Qs
    if Qs is None:
        Qs = {}
    else:
        Qs = deepcopy(Qs)
    for i in range(episodes):
        obs_new = env.reset()
        if obs_new not in Qs.keys():
            Qs[obs_new] = [init for i in range(num_act)]
        if obs_new not in vs.keys():
            vs[obs_new] = [init for i in range(num_act)]
        done = False
        t = 0
        while done is False and t < episode_length:
            # lazily initialize entries for never-seen observations
            if obs_new not in Qs.keys():
                Qs[obs_new] = [init for i in range(num_act)]
            if obs_new not in vs.keys():
                vs[obs_new] = [init for i in range(num_act)]
            # epsilon-greedy action selection on the scalarized Q-values
            if np.random.uniform() > epsilon:
                act_new = rand_argmax(Q_trafo(Qs[obs_new]))
            else:
                act_new = np.random.choice(np.arange(num_act))
            if t > 0:
                # one-step Q-learning update for the PREVIOUS transition
                # (obs, act, rew, obs_new); greedy action chosen via Q_trafo
                error = (rew +
                         gamma * Qs[obs_new][np.argmax(Q_trafo(Qs[obs_new]))]
                         - Qs[obs][act])
                Qs[obs][act] = Qs[obs][act] + alpha(vs[obs][act], t) * error
                vs[obs][act] = vs[obs][act] + 1
            obs = obs_new
            act = act_new
            if hasattr(env, 'dynamic') and env.dynamic is True:
                obs_new, rew, done, _ = env.step(act, Qs)
            else:
                obs_new, rew, done, _ = env.step(act)
            if done is True:
                if soft_end is False:
                    # hard terminal: no bootstrapping beyond the reward
                    error = (rew - Qs[obs][act])
                    Qs[obs][act] = Qs[obs][act] + alpha(vs[obs][act],
                                                        t) * error
                    vs[obs][act] = vs[obs][act] + 1
                else:
                    # soft terminal: still bootstrap on the next state
                    if obs_new not in Qs.keys():
                        Qs[obs_new] = [init for i in range(num_act)]
                    error = (rew +
                             gamma *
                             Qs[obs_new][np.argmax(Q_trafo(Qs[obs_new]))]
                             - Qs[obs][act])
                    Qs[obs][act] = Qs[obs][act] + alpha(vs[obs][act],
                                                        t) * error
                    vs[obs][act] = vs[obs][act] + 1
            t = t + 1
        # periodic greedy-policy evaluation (prints episode index, return)
        if i % eval_interval == (-1) % eval_interval:
            obs = env.reset()
            total = 0
            if obs not in Qs.keys():
                Qs[obs] = [init for i in range(num_act)]
            done = False
            s = 0
            while done is False and s < episode_length:
                act = rand_argmax(Q_trafo(Qs[obs]))
                obs_new, rew, done, _ = env.step(act)
                if obs_new not in Qs.keys():
                    Qs[obs_new] = [init for i in range(num_act)]
                obs = obs_new
                total = total + rew
                s = s + 1
            print(i, total)
    return Qs
def do_clustering(self):
    """Spectral clustering of the s sampled items into K clusters,
    followed by a likelihood-based local improvement step.

    Tries log(s) neighborhood radii, keeps the one with the smallest
    within-cluster dispersion, estimates the edge probabilities P_kl,
    then refines assignments by likelihood maximization. Side effects:
    fills self.I_S / self.I with labels (1..K) and self.P_kl; prints
    debug error rates (assumes first half of items = cluster 1, second
    half = cluster 2 — debug only).
    """
    A = self.generate_A()  # adjacency matrix (s x s)
    # average edge density, scales the neighborhood radius
    p_tilde = 2 * np.sum(A) / self.s / (self.s - 1)
    A_low = lowrank_approx(A, self.K)  # rank-K approximation
    num_rounds = int(np.floor(np.log(self.s)))
    r_t = [0 for _ in range(num_rounds)]
    for ind in range(num_rounds):
        T, xi = self._spectral_assign(
            A_low, (ind + 1) * p_tilde * self.epsilon_alg)
        # within-cluster squared distance to the centers
        for k in range(self.K):
            for i in T[k]:
                # BUG FIX: the original accumulated A_low[v] (a stale
                # variable from the leftover-assignment loop, possibly
                # unbound) instead of A_low[i]
                r_t[ind] = r_t[ind] + np.linalg.norm(A_low[i] - xi[k])**2
    # redo the clustering with the radius that gave the smallest error
    minind = rand_argmin(np.array(r_t))
    T, xi = self._spectral_assign(
        A_low, (minind + 1) * p_tilde * self.epsilon_alg)
    self._record_labels(T)
    # for the debug (computation of the error rate)
    err_rate = self._debug_err_rate()
    print('err_rate after SC=', end="")
    print(err_rate)
    # estimation of \hat{p}(i, j)
    for i in range(self.K):
        for j in range(self.K):
            numerator = 0
            for v in T[i]:
                for u in T[j]:
                    numerator = numerator + A[v, u]
            denominator = len(T[i]) * self.s
            self.P_kl[i, j] = numerator / denominator
    # local improvement: log(s) sweeps of likelihood maximization
    S = [0 for _ in range(self.K)]
    Sprev = [0 for _ in range(self.K)]
    for k in range(self.K):
        Sprev[k] = T[k]
    for _ in range(num_rounds):
        for k in range(self.K):
            S[k] = set()
        for v in range(self.s):
            # likelihood of item v belonging to each cluster i
            likelihoods = np.zeros(self.K)
            for i in range(self.K):
                wegihtsum = 0
                psum = 0
                for k in range(self.K):
                    # NOTE(review): Sprev[i] (not Sprev[k]) and no log on
                    # (1 - psum) below look suspicious but are kept as in
                    # the original — confirm against the paper.
                    weight_by_Avw = 0
                    for w in Sprev[i]:
                        weight_by_Avw = weight_by_Avw + A[v, w]
                    wegihtsum = wegihtsum + weight_by_Avw
                    psum = psum + self.P_kl[i, k]
                    likelihoods[i] = likelihoods[i] + \
                        weight_by_Avw * np.log(self.P_kl[i, k])
                # add the case of k = 0 (in the paper's notation)
                likelihoods[i] = likelihoods[i] + \
                    (self.s - wegihtsum) * (1 - psum)
            # assign v to the maximum-likelihood cluster
            i_star = rand_argmax(likelihoods)
            S[i_star].add(v)
        # update Sprev for the next sweep
        for k in range(self.K):
            Sprev[k] = S[k]
    for k in range(self.K):
        for i in S[k]:
            self.I_S[i] = k + 1
    for i in range(self.N):
        if self.S[i] == 1:
            self.I[i] = self.I_S[self.N_to_S[i]]
    # for the debug (computation of the error rate)
    err_rate2 = self._debug_err_rate()
    print('err_rate after SP=', end="")
    print(err_rate2)
    print('err_rate improvement = ', end="")
    print(err_rate - err_rate2)

def _spectral_assign(self, A_low, radius):
    """One clustering pass: greedy center picking on the neighborhood
    graph of A_low's rows, then nearest-center assignment of leftovers.

    Returns (T, xi): list of K index sets and the (K, s) center matrix.
    """
    # Q[i] = rows of A_low within `radius` (squared distance) of row i
    Q = [set() for _ in range(self.s)]
    for i in range(self.s):
        for j in range(self.s):
            if np.linalg.norm(A_low[i] - A_low[j])**2 <= radius:
                Q[i].add(j)
    T = [0 for _ in range(self.K)]
    xi = np.zeros((self.K, self.s))
    Qprev = set()
    for k in range(self.K):
        cardinalities = [len(Q[i] - Qprev) for i in range(self.s)]
        # index v_k^\star with the largest uncovered neighborhood
        v_k = rand_argmax(np.array(cardinalities))
        T[k] = Q[v_k] - Qprev
        Qprev = Qprev.union(Q[v_k])
        for i in T[k]:
            xi[k] = xi[k] + A_low[i] / len(T[k])
    # remaining items: assign each to the NEAREST center.
    # BUG FIX: the original's first pass used rand_argmax(distances)
    # (farthest center); the second pass correctly used rand_argmin.
    if len(Qprev) != self.s:
        for v in set(range(self.s)) - Qprev:
            distances = np.zeros(self.K)
            for k in range(self.K):
                distances[k] = np.linalg.norm(A_low[v] - xi[k])**2
            k_star = rand_argmin(distances)
            T[k_star].add(v)
    return T, xi

def _record_labels(self, T):
    """Write cluster labels (1..K) into I_S and map them back to I."""
    for k in range(self.K):
        for i in T[k]:
            self.I_S[i] = k + 1
    for i in range(self.N):
        if self.S[i] == 1:
            self.I[i] = self.I_S[self.N_to_S[i]]

def _debug_err_rate(self):
    """Debug-only misclassification rate against the synthetic ground
    truth (first half of the N items = cluster 1, second half = 2)."""
    err_num = 0
    for i in range(self.N):
        if i <= int(self.N / 2 - 1):
            if self.I[i] == 2:
                err_num = err_num + 1
        if i > int(self.N / 2 - 1):
            if self.I[i] == 1:
                err_num = err_num + 1
    return min(err_num / self.s, 1 - err_num / self.s)
def exploitation(self):
    """Exploitation step combining a round-robin warm-up with an
    L-set (confidence-set) based cluster recommendation.

    Users in U_0 during the first phase (t <= T_1) get round-robin
    recommendations alternating between clusters 1 and 2. Afterwards,
    the set L of statistically plausible user groups is computed and an
    item is recommended from the best cluster of a sampled group.
    Updates item_selected and the selection counter.
    """
    if self.U_0[self.u_current] == 1 and self.t <= self.T_1:
        # round-robin recommendations, alternating cluster per user
        if self.k_prev[self.u_current] == 1:
            self.k_prev[self.u_current] = 2
            self._recommend_from(2)
        else:
            self.k_prev[self.u_current] = 1
            self._recommend_from(1)
    else:
        # exploitation using L
        # x_kl: distance of the user's profile to each group profile,
        # shrunk by the eps_users confidence slack
        x_kl = np.zeros((self.K, 2))
        for k in range(self.K):
            for l in range(2):
                x_kl[k, l] = np.max([
                    np.abs(self.P_kl_user[k, l] -
                           self.rho_users[self.u_current, k]) -
                    self.eps_users, 0
                ])
        # L_ind: groups whose weighted squared distance is within the
        # log-confidence radius
        L_ind = set()
        for l in range(2):
            term = 0
            for k in range(self.K):
                cnt_k = np.sum(
                    np.multiply(np.array(self.Xi_u),
                                np.array(self.I) == k + 1))
                term = term + cnt_k * x_kl[k, l]**2
            cnt_user = np.sum(np.array(self.Xi_u))
            if term < 0.01 * np.log(cnt_user):
                L_ind.add(l)
        if len(L_ind) != 0:
            # best cluster of a uniformly sampled plausible group.
            # BUG FIX: random.sample() rejects sets (TypeError since
            # Python 3.11, deprecated in 3.9) — convert to a list first.
            setbestk = set()
            for l in L_ind:
                setbestk.add(self.argmax_k[l])
            recom_k = random.sample(list(setbestk), 1)
        else:
            # no plausible group: pick a cluster uniformly at random
            recom_k = random.sample(range(self.K), 1)
        # increment k so that it aligns with the actual (1-based) index
        recom_k = recom_k[0] + 1
        if recom_k == 1 or recom_k == 2:
            self._recommend_from(recom_k)
        else:
            # fall back to round-robin recommendations
            if self.k_prev[self.u_current] == 1:
                self.k_prev[self.u_current] = 2
                self._recommend_from(2)
            else:
                self.k_prev[self.u_current] = 1
                self._recommend_from(1)
    # update the counter
    self.item_cnt[self.item_selected] = self.item_cnt[self.item_selected] + 1
    assert self.Xi_current[self.item_selected, self.u_current] == 0

def _recommend_from(self, k):
    """Pick a not-yet-shown item from cluster k; fall back to a random
    unseen item when cluster k has nothing recommendable."""
    recommendable_from_I_k = np.multiply(
        np.array(self.Xi_u) == 0, np.array(self.I) == k)
    if sum(recommendable_from_I_k) > 0:
        self.item_selected = rand_argmax(recommendable_from_I_k)
    else:
        self.item_selected = rand_argmin(self.Xi_u)
        #print('random sampling (exploi)')