示例#1
0
    def conditional_pq_gen(cond_user):
        """
        Endlessly yield (p, q)-samples that are conditioned on ``cond_user`` being included.

        This is a closure: ``users``, ``U``, ``p``, ``q``, ``user_indices``, ``edge_list`` and
        ``weights`` are read from the enclosing scope. ``user_indices[j]`` is used as the
        ``(start, finish)`` slice of ``edge_list`` / ``weights`` holding the edges of the j-th user.

        :param cond_user: label of the user that every sample must contain
                          (assumed to be present in ``users`` -- TODO confirm at the call site)
        :return: generator yielding ``[adjacency matrix, selected users, selected items]``,
                 i.e. the three values returned by ``edge_list_to_matrix``
        """
        # position of the conditioning user in `users`; it does not change between samples
        cond_user_idx = np.searchsorted(users, cond_user)

        while True:
            # p-sample using efficient representation
            pick_number = np.random.binomial(U, p)
            selected_user_indices = choice(U, pick_number, replace=False)

            # add in the user we're conditioning on selecting
            if selected_user_indices.size == 0:
                # the binomial draw can be 0, in which case there is no slot to overwrite
                selected_user_indices = np.array([cond_user_idx])
            elif cond_user_idx not in selected_user_indices:
                selected_user_indices[0] = cond_user_idx

            index_pair_list = user_indices[selected_user_indices].tolist()
            p_sample_edge_list = np.concatenate(
                [edge_list[start:finish] for start, finish in index_pair_list])
            p_sample_weights = np.concatenate(
                [weights[start:finish] for start, finish in index_pair_list])
            p_sample = (p_sample_edge_list, p_sample_weights)

            # to force the user to be in the graph we must also ensure at least one of its neighbours is included
            cond_start, cond_finish = user_indices[cond_user_idx]
            cond_user_neighbours = edge_list[cond_start:cond_finish][:, 1]
            cond_neighbour = choice(cond_user_neighbours)

            # q-sample naively (but now on *much* smaller graph)
            subgraph = cond_item_p_sample(p_sample, q, cond_neighbour)
            selected_adj_mat, selected_users, selected_items = edge_list_to_matrix(
                subgraph)
            yield [selected_adj_mat, selected_users, selected_items]
示例#2
0
def user_p_sample(graph, p, users=None, return_split=False, seed=None):
    """
    Sample a subgraph by selecting users and keeping every edge of a selected user.

    The number of selected users is drawn from Binomial(U, p), and that many users are then
    chosen without replacement.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param p: 0 < p < 1
    :param users: optional array of user labels to sample from; defaults to the unique entries
                  of the first column of the edge list
    :param return_split: bool indicates whether to also return the complement of the sample, false by default
    :param seed: int; when given, the global numpy RNG is seeded before sampling
    :return: ``(edge_list, weights)`` of the sample if ``return_split`` is false; otherwise the
             triple ``(sample, complement, selected_users)`` where ``sample`` and ``complement``
             partition the edge list
    """

    edge_list, weights = graph

    if users is None:
        users = np.unique(edge_list[:, 0])

    U = users.shape[0]
    if seed is not None:
        np.random.seed(seed)
    pick_number = np.random.binomial(U, p)
    # numpy.random.choice has no `seed` keyword; seeding the global RNG above
    # already makes this draw reproducible
    selected_users = choice(users, pick_number, replace=False)

    in_selection = np.isin(edge_list[:, 0], selected_users)

    # boolean-mask indexing returns fresh arrays, so the results never alias the input graph
    sample = (edge_list[in_selection], weights[in_selection])
    if not return_split:
        return sample

    out_selection = ~in_selection
    remainder = (edge_list[out_selection], weights[out_selection])
    return sample, remainder, selected_users
示例#3
0
def edge_samp(graph, k, replace=False, return_split=False):
    """
    sample a subgraph by choosing k edges uniformly at random

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: int, number of edges to draw
    :param replace: bool, whether edges are drawn with replacement
    :param return_split: bool, whether to also return the edges that were never drawn
    :return: ``(selected_edges, selected_weights)``; if ``return_split`` is true, a pair of that
             tuple and ``(not_selected_edges, not_selected_weights)``
    """
    edge_list, weights = graph

    num_edges = edge_list.shape[0]
    select = choice(num_edges, k, replace=replace)
    # integer-array indexing copies, so the sample does not alias the input graph
    selected = (edge_list[select], weights[select])

    if not return_split:
        return selected

    # A plain boolean mask stays correct when `select` holds repeated indices
    # (replace=True), which a uniqueness-assuming membership test does not promise.
    not_selected = np.ones(num_edges, dtype=bool)
    not_selected[select] = False
    return selected, (edge_list[not_selected], weights[not_selected])
示例#4
0
def sample_nonedges2(edge_list, n, users=None, items=None):
    """
    Takes the edge list of a graph and returns a random sample of n non-edges from the graph,
    obtained by rejection sampling: (user, item) pairs are drawn uniformly with replacement,
    pairs that are edges are discarded, and the shortfall is re-drawn recursively.

    NOTE: the recursion only terminates if the graph has at least one non-edge.

    :param edge_list: array of (user, item) rows
    :param n: number of non-edges to return
    :param users: optional array of user labels; defaults to the unique first-column entries
    :param items: optional array of item labels; defaults to the unique second-column entries
    :return: array of shape (n, 2), an 'edge list' of the sampled non-edges
    """

    if users is None:
        users = np.unique(edge_list[:, 0])
    if items is None:
        items = np.unique(edge_list[:, 1])

    n_users = users.shape[0]
    n_items = items.shape[0]

    samp_u_idx = choice(n_users, n, replace=True)
    samp_i_idx = choice(n_items, n, replace=True)

    samp_el = np.zeros([n, 2], dtype=edge_list.dtype)
    samp_el[:, 0] = users[samp_u_idx]
    samp_el[:, 1] = items[samp_i_idx]

    # check which of the sampled pairs are actually non-edges
    # NOTE(review): samp_el is drawn with replacement and may contain repeated pairs;
    # confirm that isin_edgelist tolerates that under assume_unique=True.
    is_zero = isin_edgelist(samp_el,
                            edge_list,
                            assume_unique=True,
                            invert=True)
    num_zero = np.sum(is_zero)

    # collect the actual non-edges
    ret_nel = np.zeros([n, 2],
                       dtype=edge_list.dtype)  # holds returned edge list
    ret_nel[:num_zero] = samp_el[is_zero]

    # sample replacements for any pairs that were actually edges; recurse on this
    # function, whose signature accepts the cached users/items arrays
    if num_zero < n:
        ret_nel[num_zero:] = sample_nonedges2(edge_list, n - num_zero, users,
                                              items)

    return ret_nel
def fast_vert_samp_generator(graph, k, l, u_dist=None, i_dist=None):
    """
    Endlessly yield vertex samples of the graph as dense adjacency sub-matrices.

    :param graph: (edge_list, weights) pair
    :param k: number of user draws per sample
    :param l: number of item draws per sample
    :param u_dist: optional probability vector for the user draws; uniform over the observed
                   users (with replacement) when None
    :param i_dist: optional probability vector for the item draws; uniform over the observed
                   items (with replacement) when None
    :return: generator yielding (dense sub-matrix, selected users, selected items)
    """

    def _draw(labels, count, dist):
        # uniform draw with replacement over the observed labels
        if dist is None:
            return choice(labels, count, replace=True)
        # multinomial counts per position; a position drawn m times is emitted m times,
        # one occurrence per "level" pass
        multiplicity = np.random.multinomial(count, dist)
        return np.concatenate([
            np.flatnonzero(multiplicity > level)
            for level in range(multiplicity.max())
        ])

    edges = np.copy(graph[0])
    edge_weights = np.copy(graph[1])

    rows, cols = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix(
        (np.squeeze(edge_weights), (rows, cols))).tocsr()

    users = np.unique(rows)
    items = np.unique(cols)

    while True:
        # users are always drawn before items
        selected_users = _draw(users, k, u_dist)
        selected_items = _draw(items, l, i_dist)

        block = adjacency[selected_users][:, selected_items]

        yield block.toarray(), selected_users, selected_items
示例#6
0
def pair_samp(graph, k):
    """
    sample a subgraph by choosing k pairs uniformly at random
    (differs from edge sampling because we allow ourselves to choose non-edges)

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param k: int
    :return: a list of pairs and weights, with weight=0 indicating a non-edge. WARNING: this is not the usual (edge list)
    graph structure. If an edge appears more than once in the edge list, the weight of one
    of its occurrences is used.
    """
    edge_list, weights = graph

    users = np.unique(edge_list[:, 0])
    selected_users = choice(users, k, replace=True)

    items = np.unique(edge_list[:, 1])
    selected_items = choice(items, k, replace=True)

    selected_pairs = np.c_[selected_users, selected_items]

    # populate sampled weights
    # this is very slow if done naively, so sort by user once and bisect per pair
    sort_by_user = edge_list[:, 0].argsort()
    el_sort = edge_list[sort_by_user]
    w_sort = weights[sort_by_user]
    sorted_users = el_sort[:, 0]

    selected_weights = np.zeros(k)  # default to 0
    for i, (user, item) in enumerate(selected_pairs):
        # [user_start, user_end) is the run of edges belonging to `user`
        user_start = np.searchsorted(sorted_users, user, side="left")
        user_end = np.searchsorted(sorted_users, user, side="right")

        matches = np.flatnonzero(el_sort[user_start:user_end, 1] == item)

        if matches.size:
            # index with a plain integer so a scalar weight is assigned even
            # when the same edge is listed several times
            selected_weights[i] = w_sort[user_start + matches[0]]

    return (selected_pairs, selected_weights)
示例#7
0
def rw_w_completion_el_generator(graph, k):
    """
    Endlessly yield random-walk samples "with completion", in (edge_list, weights) form.

    Each sample starts at a user drawn with ``choice`` from the observed users and takes k
    user -> item -> user steps, picking each next vertex from the current vertex's neighbour
    list via ``randint``. The yielded edge list then contains, for every user on the walk,
    all of that user's edges whose item was visited by the walk. A user visited several times
    contributes its edges once per visit.

    :param graph: (edge_list, weights) pair
    :param k: number of user -> item -> user steps in each walk
    :return: generator yielding (edge_list, weights)
    """
    edge_list, weights = graph

    # per-vertex lookups; indexing yields a (neighbours, neighbour_weights) pair
    u_n = user_neighbours(graph)
    i_n = item_neighbours(graph)

    users = np.unique(edge_list[:, 0])

    while True:
        root = choice(users, 1)[0]

        # builtin int replaces the np.int alias, which NumPy 1.24 removed
        sel_users = np.zeros(k + 1, dtype=int)
        sel_items = np.zeros(k, dtype=int)

        sel_users[0] = root
        cur_user = root

        for smp in range(k):
            candidate_items, _ = u_n[cur_user]
            next_item = candidate_items[randint(candidate_items.shape[0])]

            candidate_users, _ = i_n[next_item]
            next_user = candidate_users[randint(candidate_users.shape[0])]

            sel_users[smp + 1] = next_user
            sel_items[smp] = next_item

            cur_user = next_user

        ret_el = []
        ret_w = []

        selected_items = np.unique(sel_items)

        # completion step: keep every edge between a visited user and a visited item
        for user in sel_users:
            neighbours, neighbours_weights = u_n[user]
            incl_neighbours_bl = np.isin(neighbours,
                                         selected_items,
                                         assume_unique=True)

            inc_neigh = neighbours[incl_neighbours_bl]
            inc_weights = neighbours_weights[incl_neighbours_bl]

            inc_el = np.zeros([inc_neigh.shape[0], 2], dtype=int)
            inc_el[:, 0] = user
            inc_el[:, 1] = inc_neigh

            ret_el.append(inc_el)
            ret_w.append(inc_weights)

        yield np.concatenate(ret_el), np.concatenate(ret_w)
示例#8
0
def sample_nonedges(edge_list, n):
    """
    Draw n non-edges (with replacement) by building the adjacency matrix of the graph and
    picking among its zero entries.

    :param edge_list: array of (user, item) rows
    :param n: number of non-edges to draw
    :return: array of shape (n, 2) of (user, item) pairs that are not edges
    """
    # every listed edge gets an indicator weight so zeros in the matrix mark non-edges
    indicator = np.ones(edge_list.shape[0], dtype=bool)
    adjacency, users, items = edge_list_to_matrix((edge_list, indicator))

    empty_rows, empty_cols = np.where(adjacency == 0)
    picks = choice(len(empty_rows), n, replace=True)

    picked_users = users[empty_rows[picks]]
    picked_items = items[empty_cols[picks]]
    return np.stack([picked_users, picked_items], 1)
示例#9
0
def fast_pq_sample_generator(input_graph, p, q):
    """
    generates pq subsamples from input_graph, in (edge_list, weights) form as returned by
    ``user_p_sample2``.

    Each sample first q-samples the items (Binomial(I, q) items chosen without replacement,
    keeping all of their edges) and then p-samples the users of that smaller graph. When the
    q-sample selects no items, an empty (edge_list, weights) pair is yielded.

    :param input_graph: (edge_list, weights) pair; it is copied, not modified
    :param p: user inclusion probability
    :param q: item inclusion probability
    :return: generator of (edge_list, weights) subsamples
    """

    edge_list, weights = np.copy(input_graph[0]), np.copy(input_graph[1])

    # change the representation of the graph to allow for faster sampling

    # sort edge list by items
    item_sort = edge_list[:, 1].argsort()
    edge_list = edge_list[item_sort]
    weights = weights[item_sort]

    items = np.unique(edge_list[:, 1])
    I = items.shape[0]

    # edge_list[item_indices[j][0]:item_indices[j][1]] is the edge list of all edges that include item j
    item_ends = np.searchsorted(edge_list[:, 1], items, side="right")
    item_indices = np.zeros([I, 2], dtype=np.int32)
    item_indices[1:, 0] = item_ends[:-1]
    item_indices[:, 1] = item_ends

    while True:
        # q-sample using efficient representation
        pick_number = np.random.binomial(I, q)
        if pick_number == 0:
            # nothing selected: np.concatenate cannot take an empty list, and a
            # p-sample of an empty graph is empty anyway
            yield (edge_list[:0], weights[:0])
            continue

        selected_item_indices = choice(I, pick_number, replace=False)
        index_pair_list = item_indices[selected_item_indices].tolist()
        q_sample_edge_list = np.concatenate(
            [edge_list[start:finish] for start, finish in index_pair_list])
        q_sample_weights = np.concatenate(
            [weights[start:finish] for start, finish in index_pair_list])
        q_sample = (q_sample_edge_list, q_sample_weights)

        # p-sample naively (but now on *much* smaller graph)
        subgraph = user_p_sample2(q_sample, p)

        yield subgraph
示例#10
0
def cond_user_p_sample(graph, p, cond_user, seed=None):
    """
    User p-sample of the graph that is forced to contain ``cond_user``.

    Binomial(U, p) users are chosen without replacement. If ``cond_user`` is not among them it
    overwrites the first chosen user, or becomes the only selected user when none were chosen.
    All edges of the selected users are returned.

    :param graph: (nparray, nparray) of (edge_list, edge_weights)
    :param p: 0 < p < 1
    :param cond_user: user to include; must appear in the first column of the edge list
    :param seed: int; when given, the global numpy RNG is seeded before sampling
    :return: (edge_list, weights) of the sample
    :raises ValueError: if ``cond_user`` does not appear in the graph
    """

    edge_list, weights = np.copy(graph[0]), np.copy(graph[1])

    # sort edge list by users
    user_sort = edge_list[:, 0].argsort()
    edge_list = edge_list[user_sort]
    weights = weights[user_sort]

    users = np.unique(edge_list[:, 0])
    U = users.shape[0]

    # searchsorted only gives an insertion point, so check the user is really there;
    # otherwise we would silently condition on a different user
    cond_user_idx = np.searchsorted(users, cond_user)
    if cond_user_idx >= U or users[cond_user_idx] != cond_user:
        raise ValueError("cond_user {!r} does not appear in the graph".format(cond_user))

    # edge_list[user_indices[j][0]:user_indices[j][1]] is the edge list of all edges that include user j
    user_indices = np.zeros([U, 2], dtype=np.int32)
    user_first_occ = np.searchsorted(edge_list[:, 0], users + 1)
    user_indices[1:, 0] = user_first_occ[:-1]
    user_indices[:, 1] = user_first_occ

    if seed is not None:
        np.random.seed(seed)

    # select the additional users
    pick_number = np.random.binomial(U, p)
    selected_users_indices = choice(U, pick_number, replace=False)

    # add in the user we're conditioning on selecting
    if selected_users_indices.size == 0:
        # the binomial draw can be 0, in which case there is no slot to overwrite
        selected_users_indices = np.array([cond_user_idx])
    elif cond_user_idx not in selected_users_indices:
        selected_users_indices[0] = cond_user_idx

    # construction of the sample of edges that connect
    index_pair_list = user_indices[selected_users_indices].tolist()
    sample_edge_list = np.concatenate(
        [edge_list[start:finish] for start, finish in index_pair_list])
    sample_weights = np.concatenate(
        [weights[start:finish] for start, finish in index_pair_list])

    return (sample_edge_list, sample_weights)
示例#11
0
def edge_samp_multi(graph, k, replace=False):
    """
    sample a subgraph of a multigraph by choosing k edge replicates uniformly at random

    An edge of weight m is treated as m replicates. The weight of an edge in the sample is the
    number of its replicates that were drawn.

    :param graph: (nparray, nparray) of (edge_list, edge_weights). Edge weights must be natural numbers!
    :param k: int, number of replicates to draw
    :param replace: bool, whether replicates are drawn with replacement
    :return: (selected_edges, selected_weights), listing each drawn edge once together with
             its draw count
    """
    edge_list, weights = graph

    cw = np.cumsum(weights)

    # equivalent to selecting uniformly at random from all edges (including replicates)
    num_replicates = int(cw[-1])
    selected_replicates = choice(num_replicates, k, replace=replace)

    # Edge i owns the replicate ids in [cw[i-1], cw[i]). side="right" returns the first
    # i with cw[i] > id; the default side="left" would assign the first replicate of
    # every edge to the preceding edge.
    select = np.searchsorted(cw, selected_replicates, side="right")

    select_unique, selected_weights = np.unique(select, return_counts=True)

    selected_edges = np.copy(edge_list[select_unique])

    return (selected_edges, selected_weights)
示例#12
0
def fast_rw_backtracking_el_generator(graph, k, p, q):
    """
    Endlessly yield backtracking random walks rooted at a uniformly chosen user,
    in (edge_list, weights) form.
    Allows repeated edges.

    Random walk works as follows:
    At user:
    with probability q, return to most recent item
    else, select random neighbour
    At item:
    with probability p, return to most recent user.
    else, select random neighbour

    The walk is not simulated step by step: backtracking n times from a vertex is treated as
    drawing 1 + n of its neighbours with replacement and continuing from the last one, so the
    number of edges per sample is random.

    :param graph: (edge_list, weights) pair
    :param k: number of (user, item) rounds per walk
    :param p: probability of backtracking at an item
    :param q: probability of backtracking at a user
    :return: generator yielding (edge_list, weights)
    """
    edge_list, weights = graph

    # per-vertex lookups; indexing yields a (neighbours, neighbour_weights) pair
    u_n = user_neighbours(graph)
    i_n = item_neighbours(graph)

    users = np.unique(edge_list[:, 0])

    while True:
        root = choice(users, 1)[0]

        ret_el = []
        ret_w = []

        cur_user = root
        for smp in range(0, 2 * k, 2):
            # Exploit that backtracking n times is equivalent to including n neighbours
            # chosen with replacement, and continuing the walk from the last one
            candidate_items, ci_weights = u_n[cur_user]
            u_degree = len(candidate_items)
            num_pick = 1 + np.random.negative_binomial(1, 1. - q)
            selected_idx = choice(u_degree, num_pick, replace=True)

            # builtin int replaces the np.int alias, which NumPy 1.24 removed
            sel_edges = np.zeros([num_pick, 2], dtype=int)
            sel_edges[:, 0] = cur_user
            sel_edges[:, 1] = candidate_items[selected_idx]

            sel_weights = ci_weights[selected_idx]

            ret_el.append(sel_edges)
            ret_w.append(sel_weights)

            next_item = candidate_items[selected_idx][-1]

            # Same deal for picking users
            candidate_users, cu_weights = i_n[next_item]
            i_degree = len(candidate_users)
            num_pick = 1 + np.random.negative_binomial(1, 1. - p)
            selected_idx = choice(i_degree, num_pick, replace=True)

            sel_edges = np.zeros([num_pick, 2], dtype=int)
            sel_edges[:, 0] = candidate_users[selected_idx]
            sel_edges[:, 1] = next_item

            sel_weights = cu_weights[selected_idx]

            ret_el.append(sel_edges)
            ret_w.append(sel_weights)

            next_user = candidate_users[selected_idx][-1]

            cur_user = next_user

        yield (np.concatenate(ret_el), np.concatenate(ret_w))
示例#13
0
def rw_backtracking_el_generator(graph, k, p, q):
    """
    Endlessly yield random walks of length 2k rooted at a uniformly chosen user,
    in (edge_list, weights) form.
    Allows repeated edges.

    Random walk works as follows:
    At user:
    with probability q, return to most recent item
    else, select random neighbour
    At item:
    with probability p, return to most recent user.
    else, select random neighbour

    :param graph: (edge_list, weights) pair
    :param k: number of (user, item) rounds per walk; each sample has 2k edges
    :param p: probability of backtracking at an item
    :param q: probability of backtracking at a user
    :return: generator yielding (edge_list of shape (2k, 2), weights of length 2k)
    """
    edge_list, weights = graph

    # per-vertex lookups; indexing yields a (neighbours, neighbour_weights) pair
    u_n = user_neighbours(graph)
    i_n = item_neighbours(graph)

    users = np.unique(edge_list[:, 0])

    while True:
        root = choice(users, 1)[0]

        # item that immediately preceded root in fictional walk, for easy backtracking
        candidate_prev_items, cpi_weights = u_n[root]
        prev_item_idx = np.random.randint(len(candidate_prev_items))
        prev_item = candidate_prev_items[prev_item_idx]
        edge_weight_from = cpi_weights[prev_item_idx]

        # builtin int replaces the np.int alias, which NumPy 1.24 removed
        ret_el = np.zeros([2 * k, 2], dtype=int)
        ret_w = np.zeros(2 * k)

        cur_user = root
        for smp in range(0, 2 * k, 2):

            if np.random.binomial(1, q):
                # backtrack
                next_item = prev_item
                edge_weight_to = edge_weight_from
            else:
                candidate_items, ci_weights = u_n[cur_user]
                next_item_idx = np.random.randint(len(candidate_items))
                next_item = candidate_items[next_item_idx]
                edge_weight_to = ci_weights[next_item_idx]

            if np.random.binomial(1, p):
                # backtrack
                next_user = cur_user
                edge_weight_from = edge_weight_to
            else:
                candidate_users, cu_weights = i_n[next_item]
                next_user_idx = np.random.randint(len(candidate_users))
                next_user = candidate_users[next_user_idx]
                edge_weight_from = cu_weights[next_user_idx]

            # row smp is the edge cur_user -> next_item, row smp + 1 is next_item -> next_user
            ret_el[smp, 0] = cur_user
            ret_el[smp + 1, 0] = next_user
            ret_el[smp:smp + 2, 1] = next_item

            ret_w[smp] = edge_weight_to
            ret_w[smp + 1] = edge_weight_from

            cur_user = next_user
            prev_item = next_item

        yield (ret_el, ret_w)
示例#14
0
def user_p_sample2(graph, p, return_split=False, seed=None):
    """
    User p-sample of the graph: Binomial(U, p) users are chosen without replacement and all of
    their edges are kept.

    Note: this is actually only slightly (~40%) faster than naive sampling

    :param graph: (nparray, nparray) of (edge_list, edge_weights); it is copied, not modified
    :param p: 0 < p < 1
    :param return_split: bool indicates whether to also return the complement of the sample, false by default
    :param seed: int; when given, the global numpy RNG is seeded before sampling
    :return: (edge_list, weights) of the sample; if ``return_split`` is true, the partition of the
             edge list into [user-samp(graph,p), G\\user-samp(graph,p)]. Either part may be empty.
    """

    edge_list, weights = np.copy(graph[0]), np.copy(graph[1])

    # sort edge list by users
    user_sort = edge_list[:, 0].argsort()
    edge_list = edge_list[user_sort]
    weights = weights[user_sort]

    users = np.unique(edge_list[:, 0])
    U = users.shape[0]

    # edge_list[user_indices[j][0]:user_indices[j][1]] is the edge list of all edges that include user j
    user_indices = np.zeros([U, 2], dtype=np.int32)
    user_first_occ = np.searchsorted(edge_list[:, 0], users + 1)
    user_indices[1:, 0] = user_first_occ[:-1]
    user_indices[:, 1] = user_first_occ

    def _edges_of(selected_indices):
        # Gather the edge/weight slices of the given users, in the given order.
        index_pair_list = user_indices[selected_indices].tolist()
        if not index_pair_list:
            # np.concatenate rejects an empty list; return empty arrays of matching dtype
            return edge_list[:0], weights[:0]
        return (np.concatenate(
            [edge_list[start:finish] for start, finish in index_pair_list]),
                np.concatenate(
                    [weights[start:finish] for start, finish in index_pair_list]))

    if seed is not None:
        np.random.seed(seed)

    # select the users
    pick_number = np.random.binomial(U, p)
    if U:
        selected_users_indices = choice(U, pick_number, replace=False)
    else:
        # empty graph: nothing to choose from
        selected_users_indices = np.zeros(0, dtype=np.intp)

    # construction of the sample of edges that connect
    sample = _edges_of(selected_users_indices)

    if not return_split:
        return sample

    remaining_user_indices = np.flatnonzero(
        np.isin(np.arange(U), selected_users_indices, invert=True))
    rem_sample = _edges_of(remaining_user_indices)

    return sample, rem_sample
def vert_samp(graph, k, l, u_dist=None, i_dist=None):
    """
    Sample k users and l items and return the induced k x l adjacency matrix.

    Sampling defaults to with replacement because multinomial sampling is easier than multidimensional hypergeometric

    :param graph: graph in usual (edge_list , weights) format
    :param k: number of users in sample
    :param l: number of items in sample
    :param u_dist: sampling distribution for users (defaults to uniform with replacement)
    :param i_dist: sampling distribution for items (defaults to uniform with replacement)
    :return: adjacency matrix of the subsample, and lists of selected users and selected items.
             The returned user and item lists are the de-duplicated (np.unique) selections.
    """
    edge_list, weights = graph

    users = np.unique(edge_list[:, 0])
    items = np.unique(edge_list[:, 1])

    if u_dist is None:
        selected_users = choice(users, k, replace=True)
    else:
        # multinomial draw: entry j counts how often position j of u_dist was picked
        proto_selected_users = np.random.multinomial(k, u_dist)
        selected_users_list = []
        # build a list of selected users, where users that are selected multiple times are repeated as required
        # NOTE(review): np.where yields positions in u_dist, not entries of `users`;
        # presumably u_dist is indexed by user label -- confirm against callers
        for mult in range(proto_selected_users.max()):
            selected_users_list += np.where(proto_selected_users > mult)
        selected_users = np.concatenate(selected_users_list)

    if i_dist is None:
        selected_items = choice(items, l, replace=True)
    else:
        # multinomial draw: entry j counts how often position j of i_dist was picked
        proto_selected_items = np.random.multinomial(l, i_dist)
        selected_items_list = []
        # build a list of selected items, where items that are selected multiple times are repeated as required
        for mult in range(proto_selected_items.max()):
            selected_items_list += np.where(proto_selected_items > mult)
        selected_items = np.concatenate(selected_items_list)

    # for with replacement sampling: collapse repeats, remembering how often each was drawn
    selected_users, u_cts = np.unique(selected_users, return_counts=True)
    selected_items, i_cts = np.unique(selected_items, return_counts=True)

    # get the non-zero entries of the subsample: filter by user first, then by item
    u_edge_selected = np.in1d(edge_list[:, 0], selected_users)
    samp_el = np.copy(edge_list[u_edge_selected])
    i_edge_selected = np.in1d(samp_el[:, 1], selected_items)
    samp_el = samp_el[i_edge_selected]

    # apply the same two filters to the weights so they stay aligned with samp_el
    samp_w = np.copy(weights[u_edge_selected])
    samp_w = samp_w[i_edge_selected]

    # construct the corresponding adjacency matrix (which can have all 0 rows or columns)

    # contiguous relabelling of the edge list
    relabel_u, relabel_i = reindex_edge_list(samp_el, selected_users,
                                             selected_items).T

    # for each all 0 user j, add phantom edge [j,0] of weight 0; this hack allows for all zero rows and columns
    zero_users = np.isin(range(selected_users.size), relabel_u,
                         invert=True).nonzero()[0]  # selected users w no edges
    relabel_u = np.append(relabel_u, zero_users)
    relabel_i = np.append(relabel_i, np.zeros_like(zero_users))
    samp_w = np.append(samp_w, np.zeros_like(zero_users, dtype=np.float32))

    # and same for items
    zero_items = np.isin(range(selected_items.size), relabel_i,
                         invert=True).nonzero()[0]  # selected items w no edges
    relabel_i = np.append(relabel_i, zero_items)
    relabel_u = np.append(relabel_u, np.zeros_like(zero_items))
    samp_w = np.append(samp_w, np.zeros_like(zero_items, dtype=np.float32))

    # add in the required copies of the users that were selected multiple times
    # NOTE(review): this passes the full-graph `weights`, while relabel_u / relabel_i are
    # aligned with `samp_w` -- confirm against _add_mult_samp_users whether samp_w was intended
    dup_ru, dup_ri, dup_w = _add_mult_samp_users(selected_users, k, u_cts,
                                                 relabel_u, relabel_i, weights)

    relabel_u = np.append(relabel_u, dup_ru)
    relabel_i = np.append(relabel_i, dup_ri)
    samp_w = np.append(samp_w, dup_w)

    # add in the required copies of items that were selected multiple times (due to with replacement sampling)
    # the helper is reused with the roles of users and items swapped
    # NOTE(review): same `weights` vs `samp_w` question as in the call above
    dup_ri, dup_ru, dup_w = _add_mult_samp_users(selected_items, l, i_cts,
                                                 relabel_i, relabel_u, weights)

    relabel_u = np.append(relabel_u, dup_ru)
    relabel_i = np.append(relabel_i, dup_ri)
    samp_w = np.append(samp_w, dup_w)

    # scatter the collected (row, column, weight) triples into a dense k x l matrix
    adj_mat = np.zeros([k, l])
    adj_mat[relabel_u, relabel_i] = np.squeeze(samp_w)

    return adj_mat, selected_users, selected_items