Example #1
def compute_mu_and_m_confidence_interval(event_dict, node_membership,
                                         num_classes, z_alpha, duration):
    """
    Computes the confidence interval for mu and m (alpha to beta ratio)

    :param event_dict: Edge dictionary of events between all node pairs.
    :param node_membership: (list) membership of every node to one of K classes.
    :param num_classes: (int) number of blocks / classes
    :param z_alpha: significance level (resulting in (1 - z_alpha) * 100 % CI)
    :param duration: the duration of the network

    :return: two KxK matrices of confidence-interval half-widths, one for mu and one for m
    """
    num_nodes = len(node_membership)
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    sample_mean, sample_var = estimate_utils.compute_sample_mean_and_variance(
        agg_adj, node_membership)
    bp_size = utils.calc_block_pair_size(node_membership, num_classes)

    z = 1 - (z_alpha / (2 * (num_classes**2)))
    ci_percentile = norm.ppf(1 - ((1 - z) / 2))

    mu_ci = ci_percentile * np.sqrt((9 * sample_mean) / (4 * bp_size))
    mu_ci /= duration

    m_ci = ci_percentile * np.sqrt(1 / (4 * bp_size * sample_mean))

    return mu_ci, m_ci
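A minimal usage sketch for the function above (not part of the original code base). It assumes the surrounding module's imports (numpy as np, scipy.stats.norm, utils, estimate_utils) are available; the toy event_dict below is hypothetical.

# Hypothetical toy input: keys are (sender, receiver) node pairs,
# values are lists of event timestamps in [0, duration).
toy_event_dict = {
    (0, 1): [0.5, 2.3, 7.1],
    (1, 0): [1.2],
    (2, 3): [4.4, 8.9],
}
toy_node_membership = [0, 0, 1, 1]  # 4 nodes assigned to K=2 classes

mu_ci, m_ci = compute_mu_and_m_confidence_interval(
    toy_event_dict, toy_node_membership, num_classes=2,
    z_alpha=0.05, duration=10.0)

# mu_ci and m_ci are KxK matrices of CI half-widths, so an interval for
# mu of block pair (a, b) is mu_hat[a, b] +/- mu_ci[a, b].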
Example #2
def compute_mu_pairwise_difference_confidence_interval(
        event_dict, node_membership, num_classes, mu, duration,
        block_pair_tuple_list, z_alpha):
    """
    Computes the pairwise difference of mu along with its confidence interval

    :param event_dict: Edge dictionary of events between all node pairs.
    :param node_membership: (list) membership of every node to one of K classes.
    :param num_classes: (int) number of blocks / classes
    :param mu: KxK matrix of mu values for each block pair
    :param duration: the duration of the network
    :param block_pair_tuple_list: (list) of tuples of block pairs to compare, e.g. [(1, 1, 1, 2), (1, 1, 2, 1)]
    :param z_alpha: significance level (resulting in (1 - z_alpha) * 100 % CI)

    :return: dict with the passed tuples as keys and a tuple of (difference, CI) as values
    """
    num_nodes = len(node_membership)
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    sample_mean, sample_var = estimate_utils.compute_sample_mean_and_variance(
        agg_adj, node_membership)
    bp_size = utils.calc_block_pair_size(node_membership, num_classes)

    z = 1 - (z_alpha / (4 * (num_classes - 1) * num_classes))
    ci_percentile = norm.ppf(1 - ((1 - z) / 2))

    pairwise_res_dict = {}
    for a, b, x, y in block_pair_tuple_list:
        diff = mu[a, b] - mu[x, y]
        sqroot = np.sqrt((9 / 4) * ((sample_mean[a, b] / bp_size[a, b]) +
                                    (sample_mean[x, y] / bp_size[x, y])))
        ci = ci_percentile * (1 / duration) * sqroot
        pairwise_res_dict[(a, b, x, y)] = (diff, ci)

    return pairwise_res_dict
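A hedged sketch of how the returned dictionary might be consumed (event_dict, node_membership and mu_hat are placeholders from a prior fit): a pairwise difference is significant at the chosen level exactly when its interval excludes zero, i.e. when |difference| > CI.

pairs = [(0, 0, 0, 1), (0, 0, 1, 0)]  # compare block pair (0, 0) against (0, 1) and (1, 0)
res = compute_mu_pairwise_difference_confidence_interval(
    event_dict, node_membership, num_classes, mu_hat, duration,
    pairs, z_alpha=0.05)

for (a, b, x, y), (diff, ci) in res.items():
    verdict = "significant" if abs(diff) > ci else "not significant"
    print(f"mu[{a},{b}] - mu[{x},{y}] = {diff:.5f} +/- {ci:.5f} -> {verdict}")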
Example #3
def test_spectral_clustering_on_generative_model(scalar):
    params = {
        'alpha': 0.05,
        'beta': 0.08,
        'mu_diag': 0.00075 * scalar,
        'mu_off_diag': 0.00035 if sim_type == 'b' else 0.00035 * scalar,
        'scale': False,
        'number_of_nodes': 256
    }

    event_dict, true_class_assignments = utils.simulate_community_hawkes(
        params)
    num_nodes = len(true_class_assignments)
    # Spectral clustering on aggregated adjacency matrix
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
    agg_adj_pred = spectral_cluster(agg_adj, num_classes=n_classes)
    agg_adj_sc_rand = adjusted_rand_score(true_class_assignments, agg_adj_pred)

    if not also_use_unweighted_adjacency:
        return agg_adj_sc_rand

    # Spectral clustering on unweighted adjacency matrix
    adj = utils.event_dict_to_adjacency(num_nodes, event_dict)
    adj_pred = spectral_cluster(adj, num_classes=n_classes)
    adj_sc_rand = adjusted_rand_score(true_class_assignments, adj_pred)

    return agg_adj_sc_rand, adj_sc_rand, np.sum(adj) / (num_nodes**2)
def generate_fit_block_hawkes(event_dict, node_membership,
                              bp_mu, bp_alpha, bp_beta,
                              duration, seed=None):
    """
    Generates a network from the block model and plots its count histogram against the original event_dict.

    :param event_dict: Edge dictionary of events between all node pairs.
    :param node_membership: (list) membership of every node to one of K classes.
    :param bp_mu, bp_alpha, bp_beta: Hawkes process parameters
    :param duration: duration of the network
    :param seed: seed for Block Hawkes generative process

    :return: generated_node_membership, generated_event_dict
    """

    # Generating a network
    n_nodes = len(node_membership)

    _, block_count = np.unique(node_membership, return_counts=True)
    class_prob = block_count / sum(block_count)

    generated_node_membership, generated_event_dict = block_generative_model(n_nodes, class_prob,
                                                                             bp_mu, bp_alpha, bp_beta,
                                                                             end_time=duration, seed=seed)

    generated_agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, generated_event_dict, dtype=int)
    generated_deg_count_flattened = np.reshape(generated_agg_adj, (n_nodes * n_nodes))

    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict, dtype=int)
    deg_count_flattened = np.reshape(agg_adj, (n_nodes * n_nodes))

    plt.hist(deg_count_flattened, bins=30, alpha=0.5, label='Real Data', color='blue', density=True)
    plt.hist(generated_deg_count_flattened, bins=30, alpha=0.5, label='Generated Data', color='red', density=True)

    plt.legend(loc='upper right')
    plt.xlabel('Event Count')
    plt.ylabel('Density')
    plt.title(f'Histogram of the Count Matrix Real Vs. Generated Block Model Data - K: {len(class_prob)}'
              f'\n Mean Count -  Real: {np.mean(agg_adj):.3f} - Generated: {np.mean(generated_agg_adj):.3f}')
    plt.yscale("log")
    plt.show()

    return generated_node_membership, generated_event_dict
def test_spectral_clustering_on_generative_model(n_nodes):
    if agg_adj_should_fail:
        params = {
            'number_of_nodes': n_nodes,
            'alpha': 7.0,
            'beta': 8.0,
            'mu_off_diag': 0.001,
            'mu_diag': 0.002,
            'scale': False,
            'end_time': 400,
            'class_probabilities': class_prob,
            'n_cores': chip_n_cores
        }
    else:
        params = {
            'number_of_nodes': n_nodes,
            'alpha': 0.001,
            'beta': 0.008,
            'mu_off_diag': 0.001,
            'mu_diag': 0.001,
            # 'mu_diag': 0.002,
            'alpha_diag': 0.006,
            'scale': False,
            'end_time': 400,
            'class_probabilities': class_prob,
            'n_cores': chip_n_cores
        }

    # event_dict, true_class_assignments = utils.simulate_community_hawkes(
    #     params, network_name="10-block-10k-nodes-higher-mu-diff")

    event_dict, true_class_assignments = utils.simulate_community_hawkes(
        params)

    # Spectral clustering on adjacency matrix
    adj = utils.event_dict_to_adjacency(n_nodes, event_dict)
    adj_sc_pred = spectral_cluster(adj, num_classes=n_classes, verbose=False)
    adj_sc_rand = adjusted_rand_score(true_class_assignments, adj_sc_pred)

    # Spectral clustering on aggregated adjacency matrix
    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
    agg_adj_pred = spectral_cluster(agg_adj,
                                    num_classes=n_classes,
                                    verbose=False)
    agg_adj_sc_rand = adjusted_rand_score(true_class_assignments, agg_adj_pred)

    return adj_sc_rand, agg_adj_sc_rand
def calc_per_event_log_likelihood(combined_log_likelihood,
                                  train_log_likelihood, test_event_dict,
                                  test_num_nodes):
    """
    Subtracts the train log-likelihood from the log-likelihood of the entire data and divides by the number of test events

    :param combined_log_likelihood: (float) log-likelihood of the entire data
    :param train_log_likelihood: (float) log-likelihood of the train data
    :param test_event_dict: event_dict of the test data
    :param test_num_nodes: Number of nodes in the test dataset

    :return: per test event log-likelihood
    """
    test_num_events = np.sum(
        utils.event_dict_to_aggregated_adjacency(test_num_nodes,
                                                 test_event_dict))
    return (combined_log_likelihood - train_log_likelihood) / test_num_events
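A tiny worked example of the arithmetic with made-up numbers: if the combined log-likelihood is -1000.0, the train log-likelihood is -900.0 and the test split contains 50 events, the per-test-event log-likelihood is (-1000.0 - (-900.0)) / 50 = -2.0.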
Example #7
def fit_poisson_baseline_model(event_dict,
                               num_nodes,
                               duration,
                               num_classes,
                               verbose=False):
    """
    Fits a Poisson baseline model to a network.

    :param event_dict: Edge dictionary of events between all node pairs.
    :param num_nodes: (int) Total number of nodes
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param verbose: Prints fitted Poisson baseline parameters

    :return: node_membership, lambda, block_pair_events
    """
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    # if there are as many classes as nodes, assign each node to its own class
    if num_classes == num_nodes:
        node_membership = list(range(num_nodes))
    else:
        # Running spectral clustering
        node_membership = spectral_cluster(agg_adj, num_classes=num_classes)

    count_matrix = event_dict_to_block_pair_event_counts(
        event_dict, node_membership, num_classes)

    bp_lambda = estimate_poisson_lambda(count_matrix,
                                        node_membership,
                                        duration,
                                        num_classes,
                                        default_lambda=1e-10 / duration)

    # Printing information about the fit
    if verbose:
        _, block_count = np.unique(node_membership, return_counts=True)
        class_prob = block_count / sum(block_count)

        print(f"Membership percentage: ", class_prob)

        print("Lambda:")
        print(bp_lambda)

    return node_membership, bp_lambda, count_matrix
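A minimal call sketch for the baseline fit above; all input names and sizes are placeholders, and the utils / spectral_cluster helpers from the same code base are assumed importable.

node_membership, bp_lambda, count_matrix = fit_poisson_baseline_model(
    event_dict, num_nodes=100, duration=1000, num_classes=4, verbose=True)

# bp_lambda[a, b] is the estimated Poisson rate of block pair (a, b);
# count_matrix[a, b] is the total event count observed in that pair.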
Example #8
def estimate_bp_hawkes_params(event_dict,
                              node_membership,
                              duration,
                              num_classes,
                              agg_adj=None,
                              return_block_pair_events=False):
    """
    Estimate CHIP Hawkes parameters.

    :param event_dict: Edge dictionary of events between all node pairs.
    :param node_membership: (list) membership of every node to one of K classes.
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param agg_adj: (optional) np array (num_nodes x num_nodes) Adjacency matrix where element ij denotes the
                    number of events between nodes i and j. If None, this will be calculated.
    :param return_block_pair_events: (bool) If True, also returns the block_pair_events

    :return: parameters of the CHIP model -> mu, alpha, beta, m
    """

    if agg_adj is None:
        num_nodes = len(node_membership)
        agg_adj = utils.event_dict_to_aggregated_adjacency(
            num_nodes, event_dict)

    bp_mu, bp_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
        agg_adj, node_membership, duration, 1e-10 / duration)

    bp_beta = np.zeros((num_classes, num_classes), dtype=float)
    block_pair_events = utils.event_dict_to_block_pair_events(
        event_dict, node_membership, num_classes)
    bp_size = utils.calc_block_pair_size(node_membership, num_classes)

    for b_i in range(num_classes):
        for b_j in range(num_classes):
            bp_beta[b_i, b_j], _ = estimate_utils.estimate_beta_from_events(
                block_pair_events[b_i][b_j], bp_mu[b_i, b_j],
                bp_alpha_beta_ratio[b_i, b_j], duration, bp_size[b_i, b_j])

    bp_alpha = bp_alpha_beta_ratio * bp_beta

    if return_block_pair_events:
        return bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio, block_pair_events

    return bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio
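A sketch of the typical calling pattern, assuming spectral_cluster and the utils module from the same code base; event_dict and the sizes below are placeholders.

num_nodes, num_classes, duration = 100, 4, 1000

agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
node_membership = spectral_cluster(agg_adj, num_classes=num_classes)

bp_mu, bp_alpha, bp_beta, bp_m = estimate_bp_hawkes_params(
    event_dict, node_membership, duration, num_classes, agg_adj=agg_adj)

# Each returned parameter is a KxK matrix, e.g. bp_mu[a, b] is the
# baseline rate of block pair (a, b).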
Example #9
def test_spectral_clustering_on_generative_model(n, t, k):
    params = {'number_of_nodes': n,
              'end_time': t,
              'class_probabilities': np.ones(k) / k,
              'alpha': 0.06,
              'beta': 0.08,
              'mu_diag': 0.085,
              'mu_off_diag': 0.065,
              'scale': False,
              'n_cores': 1}

    event_dict, true_class_assignments = utils.simulate_community_hawkes(params)

    # Spectral clustering on aggregated adjacency matrix
    agg_adj = utils.event_dict_to_aggregated_adjacency(len(true_class_assignments), event_dict)
    agg_adj_pred = spectral_cluster(agg_adj, num_classes=k)
    agg_adj_sc_rand = adjusted_rand_score(true_class_assignments, agg_adj_pred)

    return agg_adj_sc_rand
def calc_mean_and_error_of_count_estiamte(n_nodes):
    params = {
        'number_of_nodes': n_nodes,
        'class_probabilities': class_probs,
        'end_time': end_time,
        'mu_diag': mu_diag,
        'mu_off_diag': mu_off_diag,
        'alpha': alpha_off_diag,
        'alpha_diag': alpha_diag,
        'beta': beta_off_diag,
        'beta_diag': beta_diag,
        'scale': False
    }

    event_dict, true_node_membership = utils.simulate_community_hawkes(params)

    invalid_cluster = True

    while invalid_cluster:
        # Spectral clustering on aggregated adjacency matrix
        agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
        node_membership = spectral_cluster(agg_adj,
                                           num_classes=n_classes,
                                           verbose=False)
        unique_vals, cnts = np.unique(node_membership, return_counts=True)
        invalid_cluster = len(unique_vals) != n_classes
        if invalid_cluster:
            print(unique_vals, cnts)

    sc_rand = adjusted_rand_score(true_node_membership, node_membership)
    # match the shape of other params to retrieve easily
    sc_rand = np.full((n_classes, n_classes), sc_rand)

    # param estimation with estimated communities
    bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, end_time, n_classes)
    # param estimation with known communities. k_ is for known_
    k_bp_mu, k_bp_alpha, k_bp_beta, k_bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(
        event_dict, true_node_membership, end_time, n_classes)
    return bp_mu, bp_alpha_beta_ratio, bp_alpha, bp_beta, sc_rand, k_bp_mu, k_bp_alpha_beta_ratio, k_bp_alpha, k_bp_beta
def calc_mean_and_error_of_count_estiamte(n_nodes):
    params = {
        'number_of_nodes': n_nodes,
        'class_probabilities': class_probs,
        'end_time': end_time,
        'alpha': alpha,
        'beta': beta,
        'mu_diag': mu_diag,
        'scale': False
    }

    event_dict, node_membership = utils.simulate_community_hawkes(params)

    if estimate_alpha_beta:
        bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(
            event_dict, node_membership, end_time, len(class_probs))
        return bp_mu, bp_alpha_beta_ratio, bp_alpha, bp_beta

    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
    bp_mu, bp_alpha_beta_ratio = estimate_hawkes_from_counts(
        agg_adj, node_membership, end_time, 1e-10 / end_time)

    return bp_mu, bp_alpha_beta_ratio
def estimate_bp_hawkes_params(event_dict, node_membership, duration,
                              num_classes):
    """
    Estimate CHIP Hawkes parameters.

    :param event_dict: Edge dictionary of events between all node pairs.
    :param node_membership: (list) membership of every node to one of K classes.
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes

    :return: parameters of the CHIP model -> mu, alpha, beta, m
    """

    num_nodes = len(node_membership)

    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
    bp_mu, bp_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
        agg_adj, node_membership, duration, 1e-10 / duration)

    bp_beta = np.zeros((num_classes, num_classes), dtype=float)
    block_pair_events = utils.event_dict_to_block_pair_events(
        event_dict, node_membership, num_classes)

    for b_i in range(num_classes):
        for b_j in range(num_classes):
            bp_size = len(np.where(node_membership == b_i)[0]) * len(
                np.where(node_membership == b_j)[0])
            if b_i == b_j:
                bp_size -= len(np.where(node_membership == b_i)[0])

            bp_beta[b_i, b_j], _ = estimate_utils.estimate_beta_from_events(
                block_pair_events[b_i][b_j], bp_mu[b_i, b_j],
                bp_alpha_beta_ratio[b_i, b_j], duration, bp_size)

    bp_alpha = bp_alpha_beta_ratio * bp_beta

    return bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio
Example #13
def plot_event_count_hist(event_dict, num_nodes, dset_title_name):
    """
    Plot Histogram of Event Count

    :param event_dict: event_dict of interactions
    :param num_nodes: number of nodes in the dataset
    :param dset_title_name: Name of the dataset to be added to the title

    :rtype: None (show hist)
    """
    event_agg_adj = utils.event_dict_to_aggregated_adjacency(
        num_nodes, event_dict)

    num_events = np.reshape(event_agg_adj, num_nodes**2)

    plt.hist(num_events, 50, density=True)
    plt.xlabel("Number of Events")
    plt.ylabel("Density")
    plt.title(
        f"Histogram of {dset_title_name}'s Number of Interactions \n"
        f" Mean Count: {np.mean(num_events):.4f}, Total count: {np.sum(num_events)}"
    )
    plt.yscale("log")
    plt.show()
    params = {
        'number_of_nodes': n_nodes,
        'alpha': 0.6,
        'beta': 0.8,
        'mu_off_diag': 0.8,
        'mu_diag': 1.6,
        'end_time': duration,
        'class_probabilities': np.ones(n_classes) / n_classes,
        'n_cores': -1
    }

    event_dict, true_class_assignments = utils.simulate_community_hawkes(
        params, network_name="local_seach_test_networks", load_if_exists=False)

    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
    spectral_node_membership = spectral_cluster(agg_adj, num_classes=n_classes)
    sc_rand = adjusted_rand_score(true_class_assignments,
                                  spectral_node_membership)
    print(f"SC Rand index: {sc_rand:.3f}")

    print("Parallel")
    tic = time.time()
    local_search_node_membership = chip_local_search(event_dict,
                                                     n_classes,
                                                     spectral_node_membership,
                                                     duration,
                                                     max_iter=10,
                                                     n_cores=34,
                                                     verbose=True)
    toc = time.time()
Example #15
def fit_and_eval_poisson_baseline(train_tuple,
                                  test_tuple,
                                  combined_tuple,
                                  nodes_not_in_train,
                                  k_values_to_test,
                                  verbose=False):
    """
    Fits the Poisson baseline model to the train data and evaluates the log-likelihood on the test data, by
    evaluating the log-likelihood on the combined dataset, subtracting the train log-likelihood and dividing by the
    number of events in test.

    This model is essentially the BHM, but with interactions modeled as a Poisson process. Keep in mind that
    modeling interactions as Poisson makes the BHM the same as CHIP in terms of likelihood, since generating events
    at the node-pair level with rate lambda * 1/(block pair size) is equivalent to generating at the block-pair
    level with rate lambda, then thinning.

    :param train_tuple, test_tuple, combined_tuple: Each is a tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param verbose: Prints details of the fit along the way.

    :return: (list) test log-likelihood per event for all `k_values_to_test`.
    """

    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, test_duration = test_tuple
    combined_event_dict, combined_num_nodes, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_lambda, train_block_count_matrix = \
            fit_poisson_baseline_model(train_event_dict, train_num_nodes, train_duration, num_classes, verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(
            train_node_membership, nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset
        combined_count_matrix = event_dict_to_block_pair_event_counts(
            combined_event_dict, combined_node_membership, num_classes)

        combined_log_likelihood = calc_full_log_likelihood(
            combined_count_matrix, combined_node_membership, combined_duration,
            train_bp_lambda, num_classes)

        # Calculate log-likelihood given the train dataset
        train_log_likelihood = calc_full_log_likelihood(
            train_block_count_matrix, train_node_membership, train_duration,
            train_bp_lambda, num_classes)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(
            combined_log_likelihood, train_log_likelihood, test_event_dict,
            test_num_nodes)

        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        train_n_events = np.sum(
            utils.event_dict_to_aggregated_adjacency(train_num_nodes,
                                                     train_event_dict))
        print(
            f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}",
            end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

    total_toc = time.time()

    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
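A small self-contained simulation (not part of the original code base) illustrating the thinning equivalence claimed in the docstring above: a block-pair Poisson process with rate lambda, thinned uniformly over bp_size node pairs, gives each pair the same count distribution as a direct per-pair Poisson with rate lambda / bp_size.

import numpy as np

rng = np.random.default_rng(0)
lam, bp_size, duration, n_sims = 2.0, 10, 100.0, 100_000

# Generate at the block-pair level, then thin: each event independently
# lands on one specific node pair with probability 1 / bp_size.
block_counts = rng.poisson(lam * duration, size=n_sims)
thinned = rng.binomial(block_counts, 1.0 / bp_size)

# Generate directly at the node-pair level.
direct = rng.poisson(lam * duration / bp_size, size=n_sims)

print(thinned.mean(), direct.mean())  # both close to lam * duration / bp_size = 20
print(thinned.var(), direct.var())    # both close to 20 as well (Poisson)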
      largest_connected_component_only=True, train_percentage=0.8)
toc = time.time()
print(f"Loaded the dataset in {toc - tic:.1f}s")

train_num_events = utils.num_events_in_event_dict(train_event_dict)
test_num_events = utils.num_events_in_event_dict(test_event_dict)
# if verbose:
print("Train: ", "Num Nodes:", train_num_nodes, "Duration:", train_duration,
      "Num Edges:", train_num_events)
print("Test: ", "Num Nodes:", test_num_nodes, "Duration:", test_duration,
      "Num Edges:", test_num_events)

# fit Facebook Wall-posts
if fit_chip:
    tic = time.time()
    train_agg_adj = utils.event_dict_to_aggregated_adjacency(
        train_num_nodes, train_event_dict)

    if not use_agg_adj:
        train_adj = utils.event_dict_to_adjacency(train_num_nodes,
                                                  train_event_dict)
    toc = time.time()

    if verbose:
        print(f"Generated aggregated adj in {toc - tic:.1f}s")

    tic_tot = time.time()
    tic = time.time()
    # Running spectral clustering
    if use_agg_adj:
        train_node_membership = spectral_cluster(train_agg_adj,
                                                 num_classes=num_classes,
def chip_local_search_single_core(event_dict,
                                  n_classes,
                                  node_membership_init,
                                  duration,
                                  max_iter=100,
                                  verbose=True):
    """
    This function is only here for speed comparisons against the multi-core version. All parameters are the same as
    `chip_local_search`.
    """
    n_nodes = len(node_membership_init)
    node_membership = node_membership_init
    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes,
                                                       event_dict,
                                                       dtype=int)

    # estimate initial params of CHIP and its log-likelihood
    (mu, alpha, beta, alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, duration, n_classes)

    block_pair_events = utils.event_dict_to_block_pair_events(
        event_dict, node_membership, n_classes)
    init_log_lik = fit_utils.calc_full_log_likelihood(
        block_pair_events,
        node_membership,
        mu,
        alpha,
        beta,
        duration,
        n_classes,
        add_com_assig_log_prob=False)

    log_lik = init_log_lik

    for iter in range(max_iter):
        if verbose:
            print(f"Iteration {iter}...", end='\r')

        # best_neigh will hold the best node_membership update in the form of (node_index, updated_class_membership)
        best_neigh = None

        # for each of the (k-1)*n neighboring solutions
        for n_i in range(n_nodes):
            n_i_class = node_membership[n_i]

            for c_i in range(n_classes):
                if c_i == n_i_class:
                    continue
                # update node_membership temporarily
                node_membership[n_i] = c_i

                # Evaluate the approximate log_lik of this neighbor by estimating its mu and alpha/beta ratio, reusing the previous beta.
                neigh_mu, neigh_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
                    agg_adj,
                    node_membership,
                    duration,
                    default_mu=1e-10 / duration)
                neigh_alpha = neigh_alpha_beta_ratio * beta

                block_pair_events = utils.event_dict_to_block_pair_events(
                    event_dict, node_membership, n_classes)
                neigh_log_lik = fit_utils.calc_full_log_likelihood(
                    block_pair_events,
                    node_membership,
                    neigh_mu,
                    neigh_alpha,
                    beta,
                    duration,
                    n_classes,
                    add_com_assig_log_prob=False)

                # if the log_lik of this neighbor is better than the best so far, use this neighbor as the best.
                if log_lik < neigh_log_lik:
                    log_lik = neigh_log_lik
                    best_neigh = (n_i, c_i)

                node_membership[n_i] = n_i_class

        # if no neighbor seems to increase log_lik, break. You're at a local optimum.
        if best_neigh is None:
            if verbose:
                print(f"Local solution found with {iter} iterations.")
            break

        # if a good neighbor was found, update all CHIP params, and go for the next iteration.
        node_membership[best_neigh[0]] = best_neigh[1]
        (mu, alpha, beta,
         alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
             event_dict, node_membership, duration, n_classes)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, n_classes)
        log_lik = fit_utils.calc_full_log_likelihood(
            block_pair_events,
            node_membership,
            mu,
            alpha,
            beta,
            duration,
            n_classes,
            add_com_assig_log_prob=False)

    if verbose:
        print(
            f"likelihood went from {init_log_lik:.4f} to {log_lik:.4f}. "
            f"{100 * np.abs((log_lik - init_log_lik) / init_log_lik):.2f}% increase."
        )

    return node_membership
def chip_local_search(event_dict,
                      n_classes,
                      node_membership_init,
                      duration,
                      max_iter=100,
                      n_cores=-1,
                      return_fitted_param=False,
                      verbose=True):
    """
    Performs local search / hill climbing to increase the log-likelihood of the model by switching the community of a
    single node at a time. For every neighboring solution only mu and m are estimated; beta is fixed to the base
    solution to lower the time complexity.

    :param event_dict: Edge dictionary of events between all node pairs. Output of the generative models.
    :param n_classes: (int) total number of classes/blocks
    :param node_membership_init: (list) initial membership of every node to one of K classes. Usually output of the
                                 spectral clustering
    :param duration: (int) Duration of the network
    :param max_iter: (int) maximum number of iterations to be performed by local search.
    :param n_cores: (int) number of cores to be used to parallelize the search. If -1, use all available cores.
    :param return_fitted_param: if True, return the Hawkes parameters for the model as well.
    :param verbose: If True, prints more information on local search.

    :return: local optimum node_membership if `return_fitted_param` is False, otherwise (node_membership, mu, alpha, beta).
    """
    n_nodes = len(node_membership_init)
    nodes = np.arange(n_nodes)
    node_membership = node_membership_init
    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes,
                                                       event_dict,
                                                       dtype=int)

    # estimate initial params of CHIP and its log-likelihood
    (mu, alpha, beta, alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, duration, n_classes)

    block_pair_events = utils.event_dict_to_block_pair_events(
        event_dict, node_membership, n_classes)
    init_log_lik = fit_utils.calc_full_log_likelihood(
        block_pair_events,
        node_membership,
        mu,
        alpha,
        beta,
        duration,
        n_classes,
        add_com_assig_log_prob=False)

    log_lik = init_log_lik
    n_cores = n_cores if n_cores > 0 else multiprocessing.cpu_count()
    batch_size = int(n_nodes / n_cores) + 1

    for iter in range(max_iter):
        if verbose:
            print(f"Iteration {iter}...", end='\r')

        # for each of the (k-1)*n neighboring solutions
        possible_solutions = Parallel(n_jobs=n_cores)(
            delayed(calc_node_neigh_solutions)
            (event_dict, n_classes, duration, node_membership, agg_adj, beta,
             log_lik, nodes[batch_size * ii:batch_size * (ii + 1)])
            for ii in range(n_cores))

        possible_solutions = np.array(possible_solutions)

        # if all returned log-likelihoods are np.nan, break. You're at a local optimum.
        if np.all(np.isnan(possible_solutions[:, 2])):
            if verbose:
                print(f"Local solution found with {iter} iterations.")
            break

        max_ll_neigh_idx = np.nanargmax(possible_solutions[:, 2])

        # if a good neighbor was found, update all CHIP params, and go for the next iteration.
        node_membership[int(possible_solutions[max_ll_neigh_idx, 0])] = int(
            possible_solutions[max_ll_neigh_idx, 1])
        (mu, alpha, beta,
         alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
             event_dict, node_membership, duration, n_classes)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, n_classes)
        log_lik = fit_utils.calc_full_log_likelihood(
            block_pair_events,
            node_membership,
            mu,
            alpha,
            beta,
            duration,
            n_classes,
            add_com_assig_log_prob=False)

        if iter == max_iter - 1:
            print("Warning: Max iter reached!")

    if verbose:
        print(
            f"likelihood went from {init_log_lik:.4f} to {log_lik:.4f}. "
            f"{100 * np.abs((log_lik - init_log_lik) / init_log_lik):.2f}% increase."
        )

    if return_fitted_param:
        return node_membership, mu, alpha, beta

    return node_membership
Example #19
        event_count_means = []

        for i in range(100):
            node_membership, event_dicts = community_generative_model(
                number_of_nodes,
                class_probabilities,
                bp_mu,
                bp_alpha,
                bp_beta,
                burnin,
                end_time,
                seed=seed)

            # dataset_utils.plot_event_count_hist(event_dicts, number_of_nodes, "Community Hawkes")
            event_agg_adj = utils.event_dict_to_aggregated_adjacency(
                number_of_nodes, event_dicts, dtype=int)

            # np.savetxt(f"community-hawkes-{i}.txt", event_agg_adj, delimiter=' ', fmt='%d')

            num_events = np.reshape(event_agg_adj, number_of_nodes**2)

            event_count_means.append(np.mean(num_events))

        print("mean:", np.mean(event_count_means))
        print("95% Error:",
              2 * np.std(event_count_means) / np.sqrt(len(event_count_means)))

    # print(node_membership, event_dicts.keys())
    # print(utils.event_dict_to_adjacency(number_of_nodes, event_dicts))
    # print(utils.event_dict_to_aggregated_adjacency(number_of_nodes, event_dicts))
    tic = time.time()
    fb_event_dict, fb_num_node, fb_duration = dataset_utils.load_facebook_wall(
        largest_connected_component_only=True)
    toc = time.time()

    print(f"Loaded the dataset in {toc - tic:.1f}s")

    num_events = utils.num_events_in_event_dict(fb_event_dict)
    if verbose:
        print("Num Nodes:", fb_num_node, "Duration:", fb_duration,
              "Num Edges:", num_events)

# fit Facebook Wall-posts
if fit_chip:
    tic = time.time()
    agg_adj = utils.event_dict_to_aggregated_adjacency(fb_num_node,
                                                       fb_event_dict)
    adj = utils.event_dict_to_adjacency(fb_num_node, fb_event_dict)
    toc = time.time()

    if verbose:
        print(f"Generated aggregated adj in {toc - tic:.1f}s")

    tic_tot = time.time()
    tic = time.time()
    # Running spectral clustering
    node_membership = spectral_cluster(agg_adj,
                                       num_classes=10,
                                       verbose=False,
                                       plot_eigenvalues=True)

    toc = time.time()
def fit_and_eval_community_hawkes(train_tuple,
                                  test_tuple,
                                  combined_tuple,
                                  nodes_not_in_train,
                                  k_values_to_test=(1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                    10),
                                  local_search_max_iter=0,
                                  local_search_n_cores=-1,
                                  plot_fitted_hist=False,
                                  verbose=False):
    """
    Fits the CHIP model to the train data and evaluates the log-likelihood on the test data, by evaluating the
    log-likelihood on the combined dataset, subtracting the train log-likelihood and dividing by the number of events in test.

    :param train_tuple, test_tuple, combined_tuple: Each is a tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param local_search_max_iter: if >0, then the model is fitted using local search, else local search is not used.
    :param local_search_n_cores: Number of cores to be used for local search. Ignored if local_search_max_iter <= 0.
    :param plot_fitted_hist: If True, generates a CHIP model network based on the fitted parameters and plots a
                             histogram of the event count of real vs. fitted model.
    :param verbose: Prints details of the fit along the way.

    :return: (list) test log-likelihood per event for all `k_values_to_test`.
    """

    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, test_duration = test_tuple
    combined_event_dict, combined_num_nodes, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta, train_block_pair_events = \
            model_utils.fit_community_model(train_event_dict, train_num_nodes, train_duration, num_classes,
                                            local_search_max_iter, local_search_n_cores,
                                            verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(
            train_node_membership, nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset
        combined_block_pair_events = utils.event_dict_to_block_pair_events(
            combined_event_dict, combined_node_membership, num_classes)

        combined_log_likelihood = model_utils.calc_full_log_likelihood(
            combined_block_pair_events, combined_node_membership, train_bp_mu,
            train_bp_alpha, train_bp_beta, combined_duration, num_classes)

        # Calculate log-likelihood given the train dataset
        train_log_likelihood = model_utils.calc_full_log_likelihood(
            train_block_pair_events, train_node_membership, train_bp_mu,
            train_bp_alpha, train_bp_beta, train_duration, num_classes)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(
            combined_log_likelihood, train_log_likelihood, test_event_dict,
            test_num_nodes)

        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        train_n_events = np.sum(
            utils.event_dict_to_aggregated_adjacency(train_num_nodes,
                                                     train_event_dict))
        print(
            f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}",
            end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

        if plot_fitted_hist:
            model_utils.generate_fit_community_hawkes(train_event_dict,
                                                      train_node_membership,
                                                      train_bp_mu,
                                                      train_bp_alpha,
                                                      train_bp_beta,
                                                      train_duration,
                                                      plot_fitted_hist,
                                                      n_cores=26)

    total_toc = time.time()

    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
def fit_community_model(event_dict,
                        num_nodes,
                        duration,
                        num_classes,
                        local_search_max_iter,
                        local_search_n_cores,
                        verbose=False):
    """
    Fits CHIP model to a network.

    :param event_dict: Edge dictionary of events between all node pairs.
    :param num_nodes: (int) Total number of nodes
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param local_search_max_iter: Maximum number of local search iterations to be performed. If 0, no local search is done
    :param local_search_n_cores: Number of cores to parallelize local search. Only applicable if
                                 `local_search_max_iter` > 0
    :param verbose: Prints fitted Block Hawkes parameters

    :return: node_membership, mu, alpha, beta, block_pair_events
    """

    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
    # adj = utils.event_dict_to_adjacency(num_nodes, event_dict)

    # Running spectral clustering
    node_membership = spectral_cluster(agg_adj, num_classes, verbose=False)

    if local_search_max_iter > 0 and num_classes > 1:
        node_membership, bp_mu, bp_alpha, bp_beta = cls.chip_local_search(
            event_dict,
            num_classes,
            node_membership,
            duration,
            max_iter=local_search_max_iter,
            n_cores=local_search_n_cores,
            return_fitted_param=True,
            verbose=False)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, num_classes)

    else:
        bp_mu, bp_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
            agg_adj, node_membership, duration, 1e-10 / duration)
        bp_beta = np.zeros((num_classes, num_classes), dtype=float)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, num_classes)

        for b_i in range(num_classes):
            for b_j in range(num_classes):
                bp_size = len(np.where(node_membership == b_i)[0]) * len(
                    np.where(node_membership == b_j)[0])
                if b_i == b_j:
                    bp_size -= len(np.where(node_membership == b_i)[0])

                bp_beta[b_i,
                        b_j], _ = estimate_utils.estimate_beta_from_events(
                            block_pair_events[b_i][b_j], bp_mu[b_i, b_j],
                            bp_alpha_beta_ratio[b_i, b_j], duration, bp_size)

        bp_alpha = bp_alpha_beta_ratio * bp_beta

    # Printing information about the fit
    if verbose:
        _, block_count = np.unique(node_membership, return_counts=True)
        class_prob = block_count / sum(block_count)

        print(f"Membership percentage: ", class_prob)

        print("Mu:")
        print(bp_mu)

        print("\nAlpha:")
        print(bp_alpha)

        print("\nBeta:")
        print(bp_beta)

    return node_membership, bp_mu, bp_alpha, bp_beta, block_pair_events
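A hedged end-to-end sketch of fitting CHIP with the function above; event_dict and all sizes are placeholders.

node_membership, bp_mu, bp_alpha, bp_beta, bp_events = fit_community_model(
    event_dict, num_nodes=200, duration=500, num_classes=3,
    local_search_max_iter=20, local_search_n_cores=4, verbose=True)

# bp_events[a][b] holds the event lists of block pair (a, b), which can be
# reused for likelihood evaluation without recomputing them.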
                if node_pair not in event_dicts:
                    event_dicts[node_pair] = []

                event_dicts[node_pair].append(event_times[e])

    return node_membership, event_dicts


# Example of generating from the Block Hawkes model
if __name__ == "__main__":
    seed = 1
    number_of_nodes = 8
    class_probabilities = [0.2, 0.4, 0.1, 0.2, 0.1]
    num_of_classes = len(class_probabilities)
    end_time = 10
    bp_mu, bp_alpha, bp_beta = utils.generate_random_hawkes_params(num_of_classes,
                                                                   mu_range=(0.1, 0.3),
                                                                   alpha_range=(0.2, 0.4),
                                                                   beta_range=(0.5, 1),
                                                                   seed=seed)

    node_membership, event_dicts = block_generative_model(number_of_nodes,
                                                          class_probabilities,
                                                          bp_mu, bp_alpha, bp_beta,
                                                          end_time, seed=seed)

    print(node_membership, event_dicts.keys())
    print(utils.event_dict_to_adjacency(number_of_nodes, event_dicts))
    print(utils.event_dict_to_aggregated_adjacency(number_of_nodes, event_dicts))

Example #24
def fit_and_eval_block_hawkes(train_tuple, test_tuple, combined_tuple, nodes_not_in_train,
                              k_values_to_test=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                              local_search_max_iter=0, local_search_n_cores=-1,
                              plot_fitted_hist=False, verbose=False):

    """
    Fits the Block Hawkes model (BHM) to the train data and evaluates the log-likelihood on the test data, by
    evaluating the log-likelihood on the combined dataset, subtracting the train log-likelihood and dividing by the number of events in test.

    :param train_tuple, test_tuple, combined_tuple: Each is a tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param local_search_max_iter: if >0, then the model is fitted using local search, else local search is not used.
    :param local_search_n_cores: Number of cores to be used for local search. Ignored if local_search_max_iter <= 0.
    :param plot_fitted_hist: If True, plots a histogram of the event count of real vs. fitted model.
    :param verbose: Prints details of the fit along the way.

    :return: (list) test log-likelihood per event for all `k_values_to_test`.
    """

    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, test_duration = test_tuple
    combined_event_dict, combined_num_nodes, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta, train_block_pair_events = \
            estimate_utils.fit_block_model(train_event_dict, train_num_nodes, train_duration, num_classes,
                                           local_search_max_iter, local_search_n_cores,
                                           verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(train_node_membership,
                                                                                        nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset
        combined_block_pair_events = estimate_utils.event_dict_to_combined_block_pair_events(combined_event_dict,
                                                                                             combined_node_membership,
                                                                                             num_classes)

        combined_log_likelihood = estimate_utils.calc_full_log_likelihood(combined_block_pair_events,
                                                                          combined_node_membership,
                                                                          train_bp_mu, train_bp_alpha, train_bp_beta,
                                                                          combined_duration, num_classes,
                                                                          add_com_assig_log_prob=True)

        # Calculate log-likelihood given the train dataset
        train_log_likelihood = estimate_utils.calc_full_log_likelihood(train_block_pair_events, train_node_membership,
                                                                       train_bp_mu, train_bp_alpha, train_bp_beta,
                                                                       train_duration, num_classes,
                                                                       add_com_assig_log_prob=True)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(combined_log_likelihood, train_log_likelihood,
                                                                 test_event_dict, test_num_nodes)

        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        train_n_events = np.sum(utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict))
        print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

        # Save results
        result_file_path = f'{dataset_utils.get_script_path()}/storage/results/fb_bhm_fit'
        with open(f'{result_file_path}/k{num_classes}-model-params.pckl', 'wb') as handle:
            pickle.dump([train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta, train_block_pair_events],
                        handle, protocol=pickle.HIGHEST_PROTOCOL)

        if plot_fitted_hist:
            estimate_utils.generate_fit_block_hawkes(train_event_dict, train_node_membership,
                                                     train_bp_mu, train_bp_alpha, train_bp_beta,
                                                     train_duration)

    total_toc = time.time()

    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
Example #25
def fit_community_model(event_dict,
                        num_nodes,
                        duration,
                        num_classes,
                        local_search_max_iter,
                        local_search_n_cores,
                        verbose=False):
    """
    Fits CHIP model to a network.

    :param event_dict: Edge dictionary of events between all node pairs.
    :param num_nodes: (int) Total number of nodes
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param local_search_max_iter: Maximum number of local search iterations to be performed. If 0, no local search is done
    :param local_search_n_cores: Number of cores to parallelize local search. Only applicable if
                                 `local_search_max_iter` > 0
    :param verbose: Prints fitted Block Hawkes parameters

    :return: node_membership, mu, alpha, beta, block_pair_events
    """

    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
    # adj = utils.event_dict_to_adjacency(num_nodes, event_dict)

    # Running spectral clustering
    node_membership = spectral_cluster(agg_adj,
                                       num_classes,
                                       verbose=False,
                                       plot_eigenvalues=False)

    if local_search_max_iter > 0 and num_classes > 1:
        node_membership, bp_mu, bp_alpha, bp_beta = cls.chip_local_search(
            event_dict,
            num_classes,
            node_membership,
            duration,
            max_iter=local_search_max_iter,
            n_cores=local_search_n_cores,
            return_fitted_param=True,
            verbose=False)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, num_classes)

    else:
        (bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio,
         block_pair_events) = estimate_bp_hawkes_params(
             event_dict,
             node_membership,
             duration,
             num_classes,
             agg_adj=agg_adj,
             return_block_pair_events=True)

    # Printing information about the fit
    if verbose:
        _, block_count = np.unique(node_membership, return_counts=True)
        class_prob = block_count / sum(block_count)

        print(f"Membership percentage: ", class_prob)

        print("Mu:")
        print(bp_mu)

        print("\nAlpha:")
        print(bp_alpha)

        print("\nBeta:")
        print(bp_beta)

    return node_membership, bp_mu, bp_alpha, bp_beta, block_pair_events