def conditional_pq_gen(cond_user):
    """
    Generate p,q samples conditioned on cond_user being included.

    This function reads the following names from its enclosing scope:
    ``users``, ``U``, ``p``, ``q``, ``edge_list``, ``weights`` and
    ``user_indices``, where ``edge_list[user_indices[j][0]:user_indices[j][1]]``
    is used as the block of edges belonging to the j-th user.
    NOTE(review): this presumes the enclosing scope sorted ``edge_list`` by
    user and that ``cond_user`` is present in ``users`` -- confirm there.

    :param cond_user: id of the user that every sample must contain
    :return: generator yielding lists of the three values returned by
        edge_list_to_matrix for each sampled subgraph
    """
    # Neither the position of the conditioning user nor its neighbourhood
    # changes between samples, so look them up once.
    cond_user_idx = np.searchsorted(users, cond_user)
    cond_start, cond_finish = user_indices[cond_user_idx]
    cond_user_neighbours = edge_list[cond_start:cond_finish][:, 1]

    while True:
        # p-sample using efficient representation
        pick_number = np.random.binomial(U, p)
        selected_user_indices = choice(U, pick_number, replace=False)

        # add in the user we're conditioning on selecting
        if pick_number == 0:
            # nothing was drawn, so there is no slot to overwrite: the sample
            # consists of the conditioning user alone
            selected_user_indices = np.array([cond_user_idx])
        elif cond_user_idx not in selected_user_indices:
            # overwrite one of the drawn users so the sample size is unchanged
            selected_user_indices[0] = cond_user_idx

        index_pair_list = user_indices[selected_user_indices].tolist()
        p_sample_edge_list = np.concatenate(
            [edge_list[start:finish] for start, finish in index_pair_list])
        p_sample_weights = np.concatenate(
            [weights[start:finish] for start, finish in index_pair_list])
        p_sample = (p_sample_edge_list, p_sample_weights)

        # to force the user to be in the graph we must also ensure at least
        # one of its neighbours is included
        cond_neighbour = choice(cond_user_neighbours)

        # q-sample naively (but now on *much* smaller graph)
        subgraph = cond_item_p_sample(p_sample, q, cond_neighbour)

        # presumably (adjacency matrix, users, items) -- verify against
        # edge_list_to_matrix
        selected_adj_mat, selected_users, selected_items = edge_list_to_matrix(
            subgraph)
        yield [selected_adj_mat, selected_users, selected_items]
def user_p_sample(graph, p, users=None, return_split=False, seed=None):
    """
    Keep a Binomial(U, p) number of users, chosen without replacement, together
    with every edge incident to a kept user.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param p: 0 < p < 1
    :param users: optional array of user ids to sample from; defaults to the
        unique entries of edge_list[:, 0]
    :param return_split: bool indicates whether to also return the complement of the sample, false by default
    :param seed: int; when given, numpy's global random state is re-seeded with it
    :return: user-samp(graph,p) as (edge_list, weights); when return_split is
        set, the triple (user-samp(graph,p), G\\user-samp(graph,p), selected_users)
    """
    edge_list, weights = graph
    if users is None:
        users = np.unique(edge_list[:, 0])
    U = users.shape[0]

    if seed is not None:
        np.random.seed(seed)

    pick_number = np.random.binomial(U, p)
    # The global seed set above already makes this draw reproducible;
    # numpy's choice has no `seed` keyword, so none is forwarded.
    selected_users = choice(users, pick_number, replace=False)

    in_selection = np.isin(edge_list[:, 0], selected_users)
    sample = (np.copy(edge_list[in_selection]), np.copy(weights[in_selection]))
    if not return_split:
        return sample

    out_selection = np.invert(in_selection)
    remainder = (np.copy(edge_list[out_selection]),
                 np.copy(weights[out_selection]))
    return sample, remainder, selected_users
def edge_samp(graph, k, replace=False, return_split=False):
    """
    sample a subgraph by choosing k edges uniformly at random

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: int
    :param replace: bool, whether edges are drawn with replacement
    :param return_split: bool, whether to also return the edges that were not drawn
    :return: (selected_edges, selected_weights); when return_split is set, the
        pair ((selected_edges, selected_weights), (not_selected_edges, not_selected_weights))
    """
    edge_list, weights = graph
    num_edges = edge_list.shape[0]

    select = choice(num_edges, k, replace=replace)
    # integer-array indexing already returns fresh arrays
    selected = (edge_list[select], weights[select])
    if not return_split:
        return selected

    # A boolean mask stays correct when `select` contains repeats
    # (replace=True), unlike a membership test that assumes unique inputs.
    not_selected = np.ones(num_edges, dtype=bool)
    not_selected[select] = False
    return selected, (edge_list[not_selected], weights[not_selected])
def sample_nonedges2(edge_list, n, users=None, items=None):
    """
    Takes the edge list of a graph and returns a random sample of n non-edges
    from the graph, found by rejection sampling: (user, item) pairs are drawn
    uniformly with replacement and pairs that are edges are redrawn.

    The loop only terminates once n non-edges have been found, so the
    candidate pairs must contain at least one non-edge when n > 0.

    :param edge_list: nparray of (user, item) rows
    :param n: number of non-edges to return
    :param users: optional array of user ids to draw from; defaults to the
        unique entries of edge_list[:, 0]
    :param items: optional array of item ids to draw from; defaults to the
        unique entries of edge_list[:, 1]
    :return: [n, 2] array, an 'edge list' of the sampled non-edges
    """
    if users is None:
        users = np.unique(edge_list[:, 0])
    if items is None:
        items = np.unique(edge_list[:, 1])
    n_users = users.shape[0]
    n_items = items.shape[0]

    ret_nel = np.zeros([n, 2], dtype=edge_list.dtype)  # holds returned edge list
    filled = 0
    while filled < n:
        # draw exactly as many candidate pairs as are still missing
        need = n - filled
        samp_el = np.zeros([need, 2], dtype=edge_list.dtype)
        samp_el[:, 0] = users[choice(n_users, need, replace=True)]
        samp_el[:, 1] = items[choice(n_items, need, replace=True)]

        # check which of the sampled pairs are actually non-edges
        # NOTE(review): the candidates are drawn with replacement and may
        # repeat; confirm isin_edgelist tolerates that with assume_unique=True
        is_zero = isin_edgelist(samp_el, edge_list, assume_unique=True, invert=True)
        num_zero = int(np.sum(is_zero))

        # collect the actual non-edges; rejected pairs are redrawn next round
        ret_nel[filled:filled + num_zero] = samp_el[is_zero]
        filled += num_zero

    return ret_nel
def fast_vert_samp_generator(graph, k, l, u_dist=None, i_dist=None):
    """
    Endlessly yield dense sub-matrices of the graph's weighted adjacency matrix.

    :param graph: (edge_list, weights); both are copied before use
    :param k: number of row (user) draws per sample
    :param l: number of column (item) draws per sample
    :param u_dist: optional probability vector for the row draws; when given,
        the draws are positions in this vector (multinomial counts expanded
        into repeated indices). When None, user ids are drawn uniformly with
        replacement.
    :param i_dist: same as u_dist, for the column draws
    :return: generator of (dense sub-matrix, selected_users, selected_items)
    """

    def _draw(pool, count, dist):
        # uniform draw with replacement when no distribution is supplied
        if dist is None:
            return choice(pool, count, replace=True)
        # otherwise expand multinomial counts level by level: every index with
        # count > 0 first, then every index with count > 1, and so on
        counts = np.random.multinomial(count, dist)
        levels = [np.where(counts > level)[0] for level in range(counts.max())]
        return np.concatenate(levels)

    el = np.copy(graph[0])
    w = np.copy(graph[1])
    rows, cols = el[:, 0], el[:, 1]
    adjacency = sparse.coo_matrix((np.squeeze(w), (rows, cols))).tocsr()
    users = np.unique(rows)
    items = np.unique(cols)

    while True:
        selected_users = _draw(users, k, u_dist)
        selected_items = _draw(items, l, i_dist)
        block = adjacency[selected_users][:, selected_items]
        yield block.toarray(), selected_users, selected_items
def pair_samp(graph, k):
    """
    sample a subgraph by choosing k pairs uniformly at random (differs from edge sampling because we allow ourselves
    to choose non-edges)

    Users and items are each drawn uniformly with replacement from the ids
    that occur in the edge list. If the edge list holds the same (user, item)
    pair more than once, the weight of the first match (after sorting by
    user) is reported.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: int
    :return: a list of pairs and weights, with weight=0 indicating a non-edge. WARNING: this is not the usual
    (edge list) graph structure
    """
    edge_list, weights = graph

    users = np.unique(edge_list[:, 0])
    selected_users = choice(users, k, replace=True)
    items = np.unique(edge_list[:, 1])
    selected_items = choice(items, k, replace=True)
    selected_pairs = np.c_[selected_users, selected_items]

    # populate sampled weights
    # sorting by user lets each lookup bisect to that user's block of edges
    # instead of scanning the whole edge list
    sort_by_user = edge_list[:, 0].argsort()
    el_sort = edge_list[sort_by_user]
    w_sort = weights[sort_by_user]
    sorted_users = el_sort[:, 0]

    selected_weights = np.zeros(k)  # default to 0
    for i, (user, item) in enumerate(selected_pairs):
        # left/right bisection bounds the user's block without assuming that
        # `user + 1` is the next possible id
        user_start = np.searchsorted(sorted_users, user, side='left')
        user_end = np.searchsorted(sorted_users, user, side='right')
        hits = np.flatnonzero(el_sort[user_start:user_end, 1] == item)
        if hits.size:
            # ravel so that both (E,) and (E, 1) weight arrays yield a scalar
            selected_weights[i] = np.ravel(w_sort[user_start + hits[0]])[0]

    return (selected_pairs, selected_weights)
def rw_w_completion_el_generator(graph, k):
    """
    Endlessly yield subgraphs built from a k-step user -> item -> user random
    walk followed by a completion step.

    The walk starts at a uniformly chosen user; each step moves to a uniformly
    chosen neighbouring item and then to a uniformly chosen neighbour of that
    item. The sample then contains, for every visited user, all of its edges
    to visited items. A user visited more than once contributes its edges once
    per visit.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: number of walk steps (k items and k + 1 users are recorded)
    :return: generator of (edge_list, weights) samples
    """
    edge_list, _ = graph
    # u_n[user] / i_n[item] unpack into (neighbour ids, edge weights)
    u_n = user_neighbours(graph)
    i_n = item_neighbours(graph)
    users = np.unique(edge_list[:, 0])

    while True:
        root = choice(users, 1)[0]
        # builtin int replaces the np.int alias removed from NumPy
        sel_users = np.zeros(k + 1, dtype=int)
        sel_items = np.zeros(k, dtype=int)
        sel_users[0] = root

        cur_user = root
        for smp in range(k):
            candidate_items, _ = u_n[cur_user]
            next_item = candidate_items[randint(candidate_items.shape[0])]
            candidate_users, _ = i_n[next_item]
            next_user = candidate_users[randint(candidate_users.shape[0])]
            sel_users[smp + 1] = next_user
            sel_items[smp] = next_item
            cur_user = next_user

        # completion: keep every edge between a visited user and a visited item
        ret_el = []
        ret_w = []
        selected_items = np.unique(sel_items)
        for user in sel_users:
            neighbours, neighbours_weights = u_n[user]
            # NOTE(review): assume_unique presumes each user's neighbour list
            # has no repeats -- confirm against user_neighbours
            incl_neighbours_bl = np.isin(neighbours, selected_items,
                                         assume_unique=True)
            inc_neigh = neighbours[incl_neighbours_bl]
            inc_weights = neighbours_weights[incl_neighbours_bl]

            inc_el = np.zeros([inc_neigh.shape[0], 2], dtype=int)
            inc_el[:, 0] = user
            inc_el[:, 1] = inc_neigh
            ret_el.append(inc_el)
            ret_w.append(inc_weights)

        yield np.concatenate(ret_el), np.concatenate(ret_w)
def sample_nonedges(edge_list, n):
    """
    Draw n non-edges, with replacement, uniformly from the zero entries of the
    adjacency matrix built by edge_list_to_matrix.

    The original note on this function reads: comparable speed to
    "sample_nonedges".

    :param edge_list: nparray of (user, item) rows
    :param n: number of non-edges to draw
    :return: [n, 2] array of (user, item) pairs
    """
    # every listed edge gets weight True, so zero entries are exactly the non-edges
    indicator = np.ones(edge_list.shape[0], dtype=bool)
    adjacency, row_ids, col_ids = edge_list_to_matrix((edge_list, indicator))

    empty_rows, empty_cols = np.where(adjacency == 0)
    picks = choice(len(empty_rows), n, replace=True)

    picked_users = row_ids[empty_rows[picks]]
    picked_items = col_ids[empty_cols[picks]]
    return np.stack([picked_users, picked_items], 1)
def fast_pq_sample_generator(input_graph, p, q):
    """
    generates pq subsamples from input_graph. Each sample is whatever
    user_p_sample2 returns for the q-sampled graph, i.e. an
    (edge_list, weights) pair.

    Items are q-sampled first using a precomputed slice table, then users are
    p-sampled on the (much smaller) result.

    :param input_graph: (nparray, nparray) of (edge_list, edge_weights); copied before use
    :param p: probability used for the user sample
    :param q: probability used for the item sample
    :return: generator of subsampled graphs
    """
    edge_list, weights = np.copy(input_graph[0]), np.copy(input_graph[1])

    # change the representation of the graph to allow for faster sampling
    # sort edge list by items
    item_sort = edge_list[:, 1].argsort()
    edge_list = edge_list[item_sort]
    weights = weights[item_sort]

    items = np.unique(edge_list[:, 1])
    I = items.shape[0]

    # edge_list[item_indices[j][0]:item_indices[j][1]] is edge list of all
    # edges that include item j. Each block ends where a right-sided bisection
    # for the item lands and starts where the previous block ended.
    item_ends = np.searchsorted(edge_list[:, 1], items, side='right')
    item_indices = np.zeros([I, 2], dtype=item_ends.dtype)
    item_indices[1:, 0] = item_ends[:-1]
    item_indices[:, 1] = item_ends

    while True:
        # q-sample using efficient representation
        pick_number = np.random.binomial(I, q)
        selected_item_indices = choice(I, pick_number, replace=False)

        if pick_number == 0:
            # no item survived: the sample is the empty graph, and there is
            # nothing to concatenate or to p-sample
            yield edge_list[:0].copy(), weights[:0].copy()
            continue

        index_pair_list = item_indices[selected_item_indices].tolist()
        q_sample_edge_list = np.concatenate(
            [edge_list[start:finish] for start, finish in index_pair_list])
        q_sample_weights = np.concatenate(
            [weights[start:finish] for start, finish in index_pair_list])
        q_sample = (q_sample_edge_list, q_sample_weights)

        # p-sample naively (but now on *much* smaller graph)
        yield user_p_sample2(q_sample, p)
def cond_user_p_sample(graph, p, cond_user, seed=None):
    """
    p-sample the users of a graph, forcing cond_user into the sample.

    A Binomial(U, p) number of users is drawn without replacement. If
    cond_user is not among them it overwrites the first drawn user; if nothing
    was drawn the sample consists of cond_user alone.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param p: 0 < p < 1
    :param cond_user: user to include
    :param seed: int; when given, numpy's global random state is re-seeded with it
    :return: (edge_list, weights) of all edges incident to the selected users
    :raises ValueError: if cond_user does not occur in the edge list
    """
    edge_list, weights = np.copy(graph[0]), np.copy(graph[1])

    # sort edge list by users
    user_sort = edge_list[:, 0].argsort()
    edge_list = edge_list[user_sort]
    weights = weights[user_sort]

    users = np.unique(edge_list[:, 0])
    U = users.shape[0]

    # searchsorted returns an insertion point even for an absent id, so check
    # that the slot really holds cond_user
    cond_user_idx = np.searchsorted(users, cond_user)
    if cond_user_idx >= U or users[cond_user_idx] != cond_user:
        raise ValueError(
            "cond_user {!r} does not appear in the edge list".format(cond_user))

    # edge_list[user_indices[j][0]:user_indices[j][1]] is edge list of all
    # edges that include user j
    user_ends = np.searchsorted(edge_list[:, 0], users, side='right')
    user_indices = np.zeros([U, 2], dtype=user_ends.dtype)
    user_indices[1:, 0] = user_ends[:-1]
    user_indices[:, 1] = user_ends

    if seed is not None:
        np.random.seed(seed)

    # select the additional users
    pick_number = np.random.binomial(U, p)
    selected_users_indices = choice(U, pick_number, replace=False)

    # add in the user we're conditioning on selecting
    if pick_number == 0:
        # empty draw: there is no slot to overwrite
        selected_users_indices = np.array([cond_user_idx])
    elif cond_user_idx not in selected_users_indices:
        selected_users_indices[0] = cond_user_idx

    # construction of the sample of edges that connect
    index_pair_list = user_indices[selected_users_indices].tolist()
    sample_edge_list = np.concatenate(
        [edge_list[start:finish] for start, finish in index_pair_list])
    sample_weights = np.concatenate(
        [weights[start:finish] for start, finish in index_pair_list])

    return (sample_edge_list, sample_weights)
def edge_samp_multi(graph, k, replace=False):
    """
    sample a subgraph by choosing k edges uniformly at random, treating an
    edge of weight w as w replicate edges

    :param graph: (nparray, nparray) of (edge_list, edge_weights). Edge weights must be natural numbers!
    :param k: int
    :param replace: bool
    :return: (selected_edges, selected_weights) where selected_weights counts
        how many replicates of each selected edge were drawn
    """
    edge_list, weights = graph
    cw = np.cumsum(weights)

    # equivalent to selecting uniformly at random from all edges (including replicates)
    num_edges = int(cw[-1])
    draws = choice(num_edges, k, replace=replace)

    # Replicate number r (0-based) belongs to the first edge whose cumulative
    # weight exceeds r, which is the right-sided insertion point. The default
    # left side would hand r == cw[j] to edge j instead of edge j + 1.
    select = np.searchsorted(cw, draws, side='right')

    select_unique, selected_weights = np.unique(select, return_counts=True)
    selected_edges = np.copy(edge_list[select_unique])
    return (selected_edges, selected_weights)
def fast_rw_backtracking_el_generator(graph, k, p, q):
    """
    Return random walk of length 2k rooted at uniformly chosen user, in (edge_list, weights) form.
    Allows repeated edges.

    Random walk works as follows:
    At user: with probability q, return to most recent item
    else, select random neighbour
    At item: with probability p, return to most recent user.
    else, select random neighbour

    Backtracks are not simulated one at a time: at each vertex a geometric
    number of neighbours is drawn with replacement, all of those edges are
    emitted, and the walk continues from the last one drawn. A sample can
    therefore contain more than 2k edges.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: number of user -> item -> user steps
    :param p: backtracking probability at items
    :param q: backtracking probability at users
    :return: generator of (edge_list, weights) samples
    """
    edge_list, _ = graph
    # u_n[user] / i_n[item] unpack into (neighbour ids, edge weights)
    u_n = user_neighbours(graph)
    i_n = item_neighbours(graph)
    users = np.unique(edge_list[:, 0])

    while True:
        root = choice(users, 1)[0]
        ret_el = []
        ret_w = []
        cur_user = root
        for _ in range(k):
            # Exploit that backtracking n times is equivalent to including n
            # neighbours chosen wr, and continuing the walk from the last one
            candidate_items, ci_weights = u_n[cur_user]
            u_degree = len(candidate_items)
            num_pick = 1 + np.random.negative_binomial(1, 1. - q)
            selected_idx = choice(u_degree, num_pick, replace=True)

            # builtin int replaces the np.int alias removed from NumPy
            sel_edges = np.zeros([num_pick, 2], dtype=int)
            sel_edges[:, 0] = cur_user
            sel_edges[:, 1] = candidate_items[selected_idx]
            ret_el.append(sel_edges)
            ret_w.append(ci_weights[selected_idx])

            next_item = candidate_items[selected_idx][-1]

            # Same deal for picking users
            candidate_users, cu_weights = i_n[next_item]
            i_degree = len(candidate_users)
            num_pick = 1 + np.random.negative_binomial(1, 1. - p)
            selected_idx = choice(i_degree, num_pick, replace=True)

            sel_edges = np.zeros([num_pick, 2], dtype=int)
            sel_edges[:, 0] = candidate_users[selected_idx]
            sel_edges[:, 1] = next_item
            ret_el.append(sel_edges)
            ret_w.append(cu_weights[selected_idx])

            cur_user = candidate_users[selected_idx][-1]

        yield (np.concatenate(ret_el), np.concatenate(ret_w))
def rw_backtracking_el_generator(graph, k, p, q):
    """
    Return random walk of length 2k rooted at uniformly chosen user, in (edge_list, weights) form.
    Allows repeated edges.

    Random walk works as follows:
    At user: with probability q, return to most recent item
    else, select random neighbour
    At item: with probability p, return to most recent user.
    else, select random neighbour

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: number of user -> item -> user steps; each sample has 2k rows
    :param p: backtracking probability at items
    :param q: backtracking probability at users
    :return: generator of (edge_list, weights) samples
    """
    edge_list, _ = graph
    # u_n[user] / i_n[item] unpack into (neighbour ids, edge weights)
    u_n = user_neighbours(graph)
    i_n = item_neighbours(graph)
    users = np.unique(edge_list[:, 0])

    while True:
        root = choice(users, 1)[0]

        # item that immediately preceded root in fictional walk, for easy backtracking
        candidate_prev_items, cpi_weights = u_n[root]
        prev_item_idx = np.random.randint(len(candidate_prev_items))
        prev_item = candidate_prev_items[prev_item_idx]
        edge_weight_from = cpi_weights[prev_item_idx]

        # builtin int replaces the np.int alias removed from NumPy
        ret_el = np.zeros([2 * k, 2], dtype=int)
        ret_w = np.zeros(2 * k)
        cur_user = root
        for smp in range(0, 2 * k, 2):
            # user -> item half-step
            if np.random.binomial(1, q):
                # backtrack along the edge the walk arrived on
                next_item = prev_item
                edge_weight_to = edge_weight_from
            else:
                candidate_items, ci_weights = u_n[cur_user]
                next_item_idx = np.random.randint(len(candidate_items))
                next_item = candidate_items[next_item_idx]
                edge_weight_to = ci_weights[next_item_idx]

            # item -> user half-step
            if np.random.binomial(1, p):
                # backtrack
                next_user = cur_user
                edge_weight_from = edge_weight_to
            else:
                candidate_users, cu_weights = i_n[next_item]
                next_user_idx = np.random.randint(len(candidate_users))
                next_user = candidate_users[next_user_idx]
                edge_weight_from = cu_weights[next_user_idx]

            # rows smp and smp + 1 hold the two edges of this step; both share
            # the item
            ret_el[smp, 0] = cur_user
            ret_el[smp + 1, 0] = next_user
            ret_el[smp:smp + 2, 1] = next_item
            ret_w[smp] = edge_weight_to
            ret_w[smp + 1] = edge_weight_from

            cur_user = next_user
            prev_item = next_item

        yield (ret_el, ret_w)
def user_p_sample2(graph, p, return_split=False, seed=None):
    """
    Keep a Binomial(U, p) number of users, chosen without replacement, together
    with every edge incident to a kept user. Works on a copy of the graph
    sorted by user so that each user's edges form one contiguous slice.

    Note: this is actually only slightly (~40%) faster than naive sampling

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param p: 0 < p < 1
    :param return_split: bool indicates whether to also return the complement of the sample, false by default
    :param seed: int; when given, numpy's global random state is re-seeded with it
    :return: user-samp(graph,p) as (edge_list, weights); when return_split is
        set, the pair [user-samp(graph,p), G\\user-samp(graph,p)]. Either part
        may be empty.
    """
    edge_list, weights = np.copy(graph[0]), np.copy(graph[1])

    # sort edge list by users
    user_sort = edge_list[:, 0].argsort()
    edge_list = edge_list[user_sort]
    weights = weights[user_sort]

    users = np.unique(edge_list[:, 0])
    U = users.shape[0]

    # edge_list[user_indices[j][0]:user_indices[j][1]] is edge list of all
    # edges that include user j: each block ends where a right-sided bisection
    # for the user lands and starts where the previous block ended
    user_ends = np.searchsorted(edge_list[:, 0], users, side='right')
    user_indices = np.zeros([U, 2], dtype=user_ends.dtype)
    user_indices[1:, 0] = user_ends[:-1]
    user_indices[:, 1] = user_ends

    def _gather(user_idx):
        # collect the edge/weight slices of the given users; an empty
        # selection yields empty arrays because np.concatenate rejects an
        # empty list
        index_pair_list = user_indices[user_idx].tolist()
        if not index_pair_list:
            return edge_list[:0], weights[:0]
        return (
            np.concatenate(
                [edge_list[start:finish] for start, finish in index_pair_list]),
            np.concatenate(
                [weights[start:finish] for start, finish in index_pair_list]),
        )

    if seed is not None:
        np.random.seed(seed)

    # select the users
    pick_number = np.random.binomial(U, p)
    selected_users_indices = choice(U, pick_number, replace=False)

    # construction of the sample of edges that connect
    sample = _gather(selected_users_indices)
    if not return_split:
        return sample

    # the complement: users that were not drawn, in increasing index order
    not_selected = np.ones(U, dtype=bool)
    not_selected[selected_users_indices] = False
    rem_sample = _gather(np.flatnonzero(not_selected))
    return sample, rem_sample
def vert_samp(graph, k, l, u_dist=None, i_dist=None):
    """
    Sample k users and l items and return the induced block of the adjacency
    matrix as a dense [k, l] array.

    Sampling defaults to with replacement because multinomial sampling is easier than multidimensional hypergeometric

    :param graph: graph in usual (edge_list , weights) format
    :param k: number of users in sample
    :param l: number of items in sample
    :param u_dist: sampling distribution for users (defaults to uniform with replacement)
    :param i_dist: sampling distribution for items (defaults to uniform with replacement)
    :return: adjacency matrix of the subsample, and lists of selected users and selected items. The returned
    lists are the de-duplicated (np.unique) selections, so they can be shorter than k and l.
    """
    edge_list, weights = graph
    users = np.unique(edge_list[:, 0])
    items = np.unique(edge_list[:, 1])

    if u_dist is None:
        selected_users = choice(users, k, replace=True)
    else:
        proto_selected_users = np.random.multinomial(k, u_dist)
        selected_users_list = []
        # build a list of selected users, where users that are selected multiple times are repeated as required
        # NOTE(review): np.where yields positions in u_dist, not entries of `users`; this matches the uniform
        # branch only if user ids coincide with those positions -- confirm
        for mult in range(proto_selected_users.max()):
            selected_users_list += np.where(proto_selected_users > mult)
        selected_users = np.concatenate(selected_users_list)

    if i_dist is None:
        selected_items = choice(items, l, replace=True)
    else:
        proto_selected_items = np.random.multinomial(l, i_dist)
        selected_items_list = []
        # build a list of selected items, where items that are selected multiple times are repeated as required
        for mult in range(proto_selected_items.max()):
            selected_items_list += np.where(proto_selected_items > mult)
        selected_items = np.concatenate(selected_items_list)

    # for with replacement sampling: collapse repeats, remembering how often each vertex was drawn
    selected_users, u_cts = np.unique(selected_users, return_counts=True)
    selected_items, i_cts = np.unique(selected_items, return_counts=True)

    # get the non-zero entries of the subsample
    # (filter by user first, then filter the survivors by item; the same two masks are applied to the weights)
    u_edge_selected = np.in1d(edge_list[:, 0], selected_users)
    samp_el = np.copy(edge_list[u_edge_selected])
    i_edge_selected = np.in1d(samp_el[:, 1], selected_items)
    samp_el = samp_el[i_edge_selected]

    samp_w = np.copy(weights[u_edge_selected])
    samp_w = samp_w[i_edge_selected]

    # construct the corresponding adjacency matrix (which can have all 0 rows or columns)
    # contiguous relabelling of the edge list
    # presumably maps ids to their positions in selected_users / selected_items -- verify against
    # reindex_edge_list
    relabel_u, relabel_i = reindex_edge_list(samp_el, selected_users, selected_items).T

    # for each all 0 user j, add phantom edge [j,0] of weight 0; this hack allows for all zero rows and columns
    zero_users = np.isin(range(selected_users.size), relabel_u, invert=True).nonzero()[0]  # selected users w no edges
    relabel_u = np.append(relabel_u, zero_users)
    relabel_i = np.append(relabel_i, np.zeros_like(zero_users))
    samp_w = np.append(samp_w, np.zeros_like(zero_users, dtype=np.float32))

    # and same for items
    zero_items = np.isin(range(selected_items.size), relabel_i, invert=True).nonzero()[0]  # selected items w no edges
    relabel_i = np.append(relabel_i, zero_items)
    relabel_u = np.append(relabel_u, np.zeros_like(zero_items))
    samp_w = np.append(samp_w, np.zeros_like(zero_items, dtype=np.float32))

    # add in the required copies of the users that were selected multiple times
    # NOTE(review): the full-graph `weights` is passed here although relabel_u / relabel_i line up with
    # `samp_w`; confirm against _add_mult_samp_users that this is intended
    dup_ru, dup_ri, dup_w = _add_mult_samp_users(selected_users, k, u_cts, relabel_u, relabel_i, weights)
    relabel_u = np.append(relabel_u, dup_ru)
    relabel_i = np.append(relabel_i, dup_ri)
    samp_w = np.append(samp_w, dup_w)

    # add in the required copies of items that were selected multiple times (due to with replacement sampling)
    # (same helper with the user/item roles swapped, hence the swapped unpacking order)
    dup_ri, dup_ru, dup_w = _add_mult_samp_users(selected_items, l, i_cts, relabel_i, relabel_u, weights)
    relabel_u = np.append(relabel_u, dup_ru)
    relabel_i = np.append(relabel_i, dup_ri)
    samp_w = np.append(samp_w, dup_w)

    # scatter the (row, column, weight) triples into the dense result
    adj_mat = np.zeros([k, l])
    adj_mat[relabel_u, relabel_i] = np.squeeze(samp_w)

    return adj_mat, selected_users, selected_items