def compute_mu_and_m_confidence_interval(event_dict, node_membership, num_classes, z_alpha, duration):
    """
    Computes the confidence interval terms for mu and m (alpha to beta ratio) of the block pairs.

    :param event_dict: Edge dictionary of events between all node pair.
    :param node_membership: (list) membership of every node to one of K classes.
    :param num_classes: (int) number of blocks / classes
    :param z_alpha: significance level (resulting in (1 - z_alpha) * 100 % CI)
    :param duration: the duration of the network
    :return: tuple (mu_ci, m_ci), the confidence interval terms for mu and m (KxK matrices
             per the original contract)
    """
    n_nodes = len(node_membership)
    count_matrix = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
    # Only the per-block-pair sample mean is needed; the variance is discarded.
    mean_count, _ = estimate_utils.compute_sample_mean_and_variance(count_matrix, node_membership)
    block_pair_size = utils.calc_block_pair_size(node_membership, num_classes)

    # z_alpha is divided by 2 * K^2 before taking the two-sided normal quantile
    # NOTE(review): looks like a Bonferroni-style correction over mu and m of each block pair - confirm
    corrected_level = 1 - (z_alpha / (2 * (num_classes**2)))
    upper_tail = 1 - ((1 - corrected_level) / 2)
    quantile = norm.ppf(upper_tail)

    mu_margin = quantile * np.sqrt((9 * mean_count) / (4 * block_pair_size))
    mu_margin /= duration

    m_margin = quantile * np.sqrt(1 / (4 * block_pair_size * mean_count))

    return mu_margin, m_margin
def compute_mu_pairwise_difference_confidence_interval(
        event_dict, node_membership, num_classes, mu, duration, block_pair_tuple_list, z_alpha):
    """
    Computes the pairwise difference of mu between block pairs along with its confidence interval term.

    :param event_dict: Edge dictionary of events between all node pair.
    :param node_membership: (list) membership of every node to one of K classes.
    :param num_classes: (int) number of blocks / classes
    :param mu: KxK matrix of mu values for each block pair
    :param duration: the duration of the network
    :param block_pair_tuple_list: (list) of tuples for pairwise difference [(1, 1, 1, 2), (1, 1, 2, 1)]
    :param z_alpha: significance level (resulting in (1 - z_alpha) * 100 % CI)
    :return: dict with passed tuples as keys and a tuple of (difference, CI) as value
    """
    n_nodes = len(node_membership)
    count_matrix = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
    # Only the per-block-pair sample mean is needed; the variance is discarded.
    mean_count, _ = estimate_utils.compute_sample_mean_and_variance(count_matrix, node_membership)
    block_pair_size = utils.calc_block_pair_size(node_membership, num_classes)

    # z_alpha is divided by 4 * (K - 1) * K before taking the two-sided normal quantile
    corrected_level = 1 - (z_alpha / (4 * (num_classes - 1) * num_classes))
    quantile = norm.ppf(1 - ((1 - corrected_level) / 2))

    def _difference_with_ci(a, b, x, y):
        # (difference of the two mu entries, confidence interval term of that difference)
        scaled_mean_sum = (mean_count[a, b] / block_pair_size[a, b]) + (mean_count[x, y] / block_pair_size[x, y])
        root = np.sqrt((9 / 4) * scaled_mean_sum)
        return mu[a, b] - mu[x, y], quantile * (1 / duration) * root

    return {(a, b, x, y): _difference_with_ci(a, b, x, y) for a, b, x, y in block_pair_tuple_list}
def test_spectral_clustering_on_generative_model(scalar):
    """
    Simulates a 256-node community Hawkes network and scores spectral clustering against the true classes.

    `sim_type`, `n_classes` and `also_use_unweighted_adjacency` are read from the enclosing scope.

    :param scalar: multiplier applied to `mu_diag`, and to `mu_off_diag` unless `sim_type` is 'b'
    :return: the adjusted rand score of clustering the aggregated adjacency if
             `also_use_unweighted_adjacency` is falsy; otherwise a tuple of (aggregated adjacency score,
             adjacency score, sum of the adjacency matrix divided by num_nodes^2)
    """
    mu_off_diag = 0.00035 if sim_type == 'b' else 0.00035 * scalar
    params = {
        'alpha': 0.05,
        'beta': 0.08,
        'mu_diag': 0.00075 * scalar,
        'mu_off_diag': mu_off_diag,
        'scale': False,
        'number_of_nodes': 256
    }

    event_dict, true_class_assignments = utils.simulate_community_hawkes(params)
    num_nodes = len(true_class_assignments)

    def _rand_score_of_clustering(matrix):
        # cluster the given matrix and compare against the simulated classes
        predicted = spectral_cluster(matrix, num_classes=n_classes)
        return adjusted_rand_score(true_class_assignments, predicted)

    # Spectral clustering on aggregated adjacency matrix
    weighted = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
    weighted_rand = _rand_score_of_clustering(weighted)

    if not also_use_unweighted_adjacency:
        return weighted_rand

    # Spectral clustering on the adjacency matrix returned by `event_dict_to_adjacency`
    unweighted = utils.event_dict_to_adjacency(num_nodes, event_dict)
    unweighted_rand = _rand_score_of_clustering(unweighted)

    return weighted_rand, unweighted_rand, np.sum(unweighted) / (num_nodes**2)
def generate_fit_block_hawkes(event_dict, node_membership, bp_mu, bp_alpha, bp_beta, duration, seed=None):
    """
    Generates a block model, then plots its count histogram against the original event_dict.

    :param event_dict: Edge dictionary of events between all node pair.
    :param node_membership: (list) membership of every node to one of K classes.
    :param bp_mu, bp_alpha, bp_beta: Hawkes process parameters
    :param duration: duration of the network
    :param seed: seed for Block Hawkes generative process
    :return: generated_node_membership, generated_event_dict
    """
    # Generating a network, with class probabilities set to the observed block proportions
    n_nodes = len(node_membership)
    _, block_count = np.unique(node_membership, return_counts=True)
    class_prob = block_count / sum(block_count)

    generated_node_membership, generated_event_dict = block_generative_model(
        n_nodes, class_prob, bp_mu, bp_alpha, bp_beta, end_time=duration, seed=seed)

    # `np.int` was a plain alias of the builtin `int` and no longer exists in recent NumPy releases,
    # so the builtin is passed as the dtype instead.
    generated_agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, generated_event_dict, dtype=int)
    generated_deg_count_flattened = np.reshape(generated_agg_adj, (n_nodes * n_nodes))

    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict, dtype=int)
    deg_count_flattened = np.reshape(agg_adj, (n_nodes * n_nodes))

    # Overlay the two density histograms of the flattened count matrices
    plt.hist(deg_count_flattened, bins=30, alpha=0.5, label='Real Data', color='blue', density=True)
    plt.hist(generated_deg_count_flattened, bins=30, alpha=0.5, label='Generated Data', color='red', density=True)

    plt.legend(loc='upper right')
    plt.xlabel('Event Count')
    plt.ylabel('Density')
    plt.title(f'Histogram of the Count Matrix Real Vs. Generated Block Model Data - K: {len(class_prob)}'
              f'\n Mean Count - Real: {np.mean(agg_adj):.3f} - Generated: {np.mean(generated_agg_adj):.3f}')
    plt.yscale("log")
    plt.show()

    return generated_node_membership, generated_event_dict
def test_spectral_clustering_on_generative_model(n_nodes):
    """
    Simulates a community Hawkes network with `n_nodes` nodes and scores spectral clustering on both the
    adjacency and the aggregated adjacency matrix.

    `agg_adj_should_fail`, `class_prob`, `chip_n_cores` and `n_classes` are read from the enclosing scope.

    :param n_nodes: (int) number of nodes to simulate
    :return: tuple of adjusted rand scores (adjacency based, aggregated adjacency based)
    """
    # The two simulation settings only differ in alpha, beta and mu_diag.
    if agg_adj_should_fail:
        alpha, beta, mu_diag = 7.0, 8.0, 0.002
    else:
        alpha, beta, mu_diag = 0.001, 0.008, 0.001

    params = {
        'number_of_nodes': n_nodes,
        'alpha': alpha,
        'beta': beta,
        'mu_off_diag': 0.001,
        'mu_diag': mu_diag,
        'scale': False,
        'end_time': 400,
        'class_probabilities': class_prob,
        'n_cores': chip_n_cores
    }

    event_dict, true_class_assignments = utils.simulate_community_hawkes(params)

    def _rand_score_of_clustering(matrix):
        # cluster the given matrix and compare against the simulated classes
        predicted = spectral_cluster(matrix, num_classes=n_classes, verbose=False)
        return adjusted_rand_score(true_class_assignments, predicted)

    # Spectral clustering on adjacency matrix
    adj_sc_rand = _rand_score_of_clustering(utils.event_dict_to_adjacency(n_nodes, event_dict))

    # Spectral clustering on aggregated adjacency matrix
    agg_adj_sc_rand = _rand_score_of_clustering(utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict))

    return adj_sc_rand, agg_adj_sc_rand
def calc_per_event_log_likelihood(combined_log_likelihood, train_log_likelihood, test_event_dict, test_num_nodes):
    """
    Subtracts the train log-likelihood from the log-likelihood of the entire data and divides the result
    by the number of test events.

    :param combined_log_likelihood: (float) log-likelihood of the entire data
    :param train_log_likelihood: (float) log-likelihood of the train data
    :param test_event_dict: event_dict of the test data
    :param test_num_nodes: Number of nodes in the test dataset
    :return: per test event log-likelihood
    """
    # The number of test events is the sum of the aggregated adjacency of the test data
    test_count_matrix = utils.event_dict_to_aggregated_adjacency(test_num_nodes, test_event_dict)
    log_likelihood_gain = combined_log_likelihood - train_log_likelihood

    return log_likelihood_gain / np.sum(test_count_matrix)
def fit_poisson_baseline_model(event_dict, num_nodes, duration, num_classes, verbose=False):
    """
    Fits a Poisson baseline model to a network.

    :param event_dict: Edge dictionary of events between all node pair.
    :param num_nodes: (int) Total number of nodes
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param verbose: Prints fitted Poisson baseline parameters
    :return: node_membership, lambda, block_pair_events
    """
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    one_class_per_node = num_classes == num_nodes
    if one_class_per_node:
        # as many classes as nodes: every node is its own class, no clustering needed
        node_membership = list(range(num_nodes))
    else:
        # Running spectral clustering
        node_membership = spectral_cluster(agg_adj, num_classes=num_classes)

    count_matrix = event_dict_to_block_pair_event_counts(event_dict, node_membership, num_classes)
    bp_lambda = estimate_poisson_lambda(
        count_matrix, node_membership, duration, num_classes, default_lambda=1e-10 / duration)

    if not verbose:
        return node_membership, bp_lambda, count_matrix

    # Printing information about the fit
    _, members_per_block = np.unique(node_membership, return_counts=True)
    membership_share = members_per_block / sum(members_per_block)
    print("Membership percentage: ", membership_share)
    print("Lambda:")
    print(bp_lambda)

    return node_membership, bp_lambda, count_matrix
def estimate_bp_hawkes_params(event_dict, node_membership, duration, num_classes,
                              agg_adj=None, return_block_pair_events=False):
    """
    Estimate CHIP Hawkes parameters.

    :param event_dict: Edge dictionary of events between all node pair.
    :param node_membership: (list) membership of every node to one of K classes.
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param agg_adj: (optional) np array (num_nodes x num_nodes) Adjacency matrix where element ij denotes the
                    number of events between nodes i an j. If None, this will be calculated.
    :param return_block_pair_events: (bool) If True, the block pair events are appended to the returned tuple
    :return: parameters of the CHIP model -> mu, alpha, beta, m
             (followed by block_pair_events if `return_block_pair_events` is True)
    """
    if agg_adj is None:
        num_nodes = len(node_membership)
        agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    # mu and m (the alpha to beta ratio) are estimated from the counts; 1e-10 / duration is passed as the
    # fourth argument (named `default_mu` at other call sites of this helper).
    bp_mu, bp_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
        agg_adj, node_membership, duration, 1e-10 / duration)

    # `np.float` was a plain alias of the builtin `float` and no longer exists in recent NumPy releases.
    bp_beta = np.zeros((num_classes, num_classes), dtype=float)

    block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, num_classes)
    bp_size = utils.calc_block_pair_size(node_membership, num_classes)

    # beta is estimated separately for every block pair from its events
    for b_i in range(num_classes):
        for b_j in range(num_classes):
            bp_beta[b_i, b_j], _ = estimate_utils.estimate_beta_from_events(
                block_pair_events[b_i][b_j], bp_mu[b_i, b_j], bp_alpha_beta_ratio[b_i, b_j],
                duration, bp_size[b_i, b_j])

    # alpha follows from m = alpha / beta
    bp_alpha = bp_alpha_beta_ratio * bp_beta

    if return_block_pair_events:
        return bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio, block_pair_events

    return bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio
def test_spectral_clustering_on_generative_model(n, t, k):
    """
    Simulates a community Hawkes network and scores spectral clustering of its aggregated adjacency.

    :param n: number of nodes passed to the simulation
    :param t: end time passed to the simulation
    :param k: number of classes; class probabilities are uniform over the k classes
    :return: adjusted rand score between the simulated classes and the spectral clustering result
    """
    uniform_class_prob = np.ones(k) / k
    params = {
        'number_of_nodes': n,
        'end_time': t,
        'class_probabilities': uniform_class_prob,
        'alpha': 0.06,
        'beta': 0.08,
        'mu_diag': 0.085,
        'mu_off_diag': 0.065,
        'scale': False,
        'n_cores': 1
    }

    event_dict, true_class_assignments = utils.simulate_community_hawkes(params)

    # Spectral clustering on aggregated adjacency matrix
    simulated_n_nodes = len(true_class_assignments)
    count_matrix = utils.event_dict_to_aggregated_adjacency(simulated_n_nodes, event_dict)
    predicted_classes = spectral_cluster(count_matrix, num_classes=k)

    return adjusted_rand_score(true_class_assignments, predicted_classes)
def calc_mean_and_error_of_count_estiamte(n_nodes):
    """
    Simulates a community Hawkes network and estimates its Hawkes parameters twice: once with the
    communities found by spectral clustering and once with the simulated (known) communities.

    The simulation settings (`class_probs`, `end_time`, `mu_diag`, `mu_off_diag`, `alpha_diag`,
    `alpha_off_diag`, `beta_diag`, `beta_off_diag`) and `n_classes` are read from the enclosing scope.

    :param n_nodes: (int) number of nodes to simulate
    :return: tuple of (mu, m, alpha, beta, sc_rand, known mu, known m, known alpha, known beta), where
             `sc_rand` is the adjusted rand score broadcast to an (n_classes x n_classes) matrix
    """
    params = {
        'number_of_nodes': n_nodes,
        'class_probabilities': class_probs,
        'end_time': end_time,
        'mu_diag': mu_diag,
        'mu_off_diag': mu_off_diag,
        'alpha': alpha_off_diag,
        'alpha_diag': alpha_diag,
        'beta': beta_off_diag,
        'beta_diag': beta_diag,
        'scale': False
    }

    event_dict, true_node_membership = utils.simulate_community_hawkes(params)

    # The aggregated adjacency only depends on the simulated events, so it is built once
    # rather than on every clustering retry.
    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)

    # Repeat spectral clustering until exactly `n_classes` distinct communities are returned
    invalid_cluster = True
    while invalid_cluster:
        node_membership = spectral_cluster(agg_adj, num_classes=n_classes, verbose=False)
        unique_vals, cnts = np.unique(node_membership, return_counts=True)
        invalid_cluster = len(unique_vals) != n_classes
        if invalid_cluster:
            print(unique_vals, cnts)

    sc_rand = adjusted_rand_score(true_node_membership, node_membership)
    # match the shape of other params to retrieve easily
    sc_rand = np.zeros((n_classes, n_classes)) + sc_rand

    # param estimation with estimated communities
    bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, end_time, n_classes)

    # param estimation with known communities. k_ is for known_
    k_bp_mu, k_bp_alpha, k_bp_beta, k_bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(
        event_dict, true_node_membership, end_time, n_classes)

    return (bp_mu, bp_alpha_beta_ratio, bp_alpha, bp_beta, sc_rand,
            k_bp_mu, k_bp_alpha_beta_ratio, k_bp_alpha, k_bp_beta)
def calc_mean_and_error_of_count_estiamte(n_nodes):
    """
    Simulates a community Hawkes network and estimates its parameters using the simulated node membership.

    `class_probs`, `end_time`, `alpha`, `beta`, `mu_diag` and `estimate_alpha_beta` are read from the
    enclosing scope.

    :param n_nodes: (int) number of nodes to simulate
    :return: (mu, m, alpha, beta) if `estimate_alpha_beta` is truthy, else only (mu, m) estimated from counts
    """
    params = {
        'number_of_nodes': n_nodes,
        'class_probabilities': class_probs,
        'end_time': end_time,
        'alpha': alpha,
        'beta': beta,
        'mu_diag': mu_diag,
        'scale': False
    }

    event_dict, node_membership = utils.simulate_community_hawkes(params)

    if not estimate_alpha_beta:
        # count-based estimation only yields mu and the alpha to beta ratio
        count_matrix = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
        mu_hat, ratio_hat = estimate_hawkes_from_counts(
            count_matrix, node_membership, end_time, 1e-10 / end_time)
        return mu_hat, ratio_hat

    mu_hat, alpha_hat, beta_hat, ratio_hat = model_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, end_time, len(class_probs))
    return mu_hat, ratio_hat, alpha_hat, beta_hat
def estimate_bp_hawkes_params(event_dict, node_membership, duration, num_classes):
    """
    Estimate CHIP Hawkes parameters.

    :param event_dict: Edge dictionary of events between all node pair.
    :param node_membership: (list) membership of every node to one of K classes.
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :return: parameters of the CHIP model -> mu, alpha, beta, m
    """
    num_nodes = len(node_membership)
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    # mu and m (the alpha to beta ratio) are estimated from the counts
    bp_mu, bp_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
        agg_adj, node_membership, duration, 1e-10 / duration)

    # `np.float` was a plain alias of the builtin `float` and no longer exists in recent NumPy releases.
    bp_beta = np.zeros((num_classes, num_classes), dtype=float)

    block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, num_classes)

    # Count the nodes of every block once. Comparing through an array keeps the comparison element-wise
    # even when `node_membership` is a plain Python list (`some_list == b` is just a scalar False).
    membership = np.asarray(node_membership)
    block_sizes = [int(np.count_nonzero(membership == b)) for b in range(num_classes)]

    for b_i in range(num_classes):
        for b_j in range(num_classes):
            bp_size = block_sizes[b_i] * block_sizes[b_j]
            if b_i == b_j:
                # within a block, a node is not paired with itself
                bp_size -= block_sizes[b_i]

            bp_beta[b_i, b_j], _ = estimate_utils.estimate_beta_from_events(
                block_pair_events[b_i][b_j], bp_mu[b_i, b_j], bp_alpha_beta_ratio[b_i, b_j],
                duration, bp_size)

    # alpha follows from m = alpha / beta
    bp_alpha = bp_alpha_beta_ratio * bp_beta

    return bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio
def plot_event_count_hist(event_dict, num_nodes, dset_title_name):
    """
    Shows a log-scaled density histogram of the number of events per node pair.

    :param event_dict: event_dict of interactions
    :param num_nodes: number of nodes in the dataset
    :param dset_title_name: Name of the dataset to be added to the title
    :rtype: None (show hist)
    """
    # Flatten the (num_nodes x num_nodes) count matrix into one count per node pair
    counts = np.reshape(utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict), num_nodes**2)

    plt.hist(counts, 50, density=True)
    plt.xlabel("Number of Events")
    plt.ylabel("Density")

    mean_count = np.mean(counts)
    total_count = np.sum(counts)
    plt.title(f"Histogram of {dset_title_name}'s Number of Interactions \n"
              f" Mean Count: {mean_count:.4f}, Total count: {total_count}")

    plt.yscale("log")
    plt.show()
# Simulation settings of the test network; `n_nodes`, `duration` and `n_classes` are defined outside of
# this fragment. Class probabilities are uniform over the `n_classes` classes.
params = {
    'number_of_nodes': n_nodes,
    'alpha': 0.6,
    'beta': 0.8,
    'mu_off_diag': 0.8,
    'mu_diag': 1.6,
    'end_time': duration,
    'class_probabilities': np.ones(n_classes) / n_classes,
    'n_cores': -1
}

# NOTE(review): `load_if_exists=False` presumably forces a fresh simulation instead of loading a stored
# network with this name - confirm against `simulate_community_hawkes`.
event_dict, true_class_assignments = utils.simulate_community_hawkes(
    params, network_name="local_seach_test_networks", load_if_exists=False)

# Spectral clustering on the aggregated adjacency; its result is the initial membership of the local search
agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
spectral_node_membership = spectral_cluster(agg_adj, num_classes=n_classes)
sc_rand = adjusted_rand_score(true_class_assignments, spectral_node_membership)
print(f"SC Rand index: {sc_rand:.3f}")

# Time the local search (at most 10 iterations, `n_cores` hard-coded to 34)
print("Parallel")
tic = time.time()
local_search_node_membership = chip_local_search(event_dict, n_classes, spectral_node_membership, duration,
                                                 max_iter=10, n_cores=34, verbose=True)
toc = time.time()
def fit_and_eval_poisson_baseline(train_tuple, test_tuple, combined_tuple, nodes_not_in_train,
                                  k_values_to_test, verbose=False):
    """
    Fits the Poisson baseline model to train and evaluates the log-likelihood on the test, by evaluating the
    log-likelihood on the combined dataset and subtracting the likelihood of train, dividing by number of
    events in test.

    This model is basically like a BHM model, but models interactions as a Poisson. Keep in mind that modeling
    interactions as Poisson makes the BHM model the same as CHIP in terms of likelihood, since generating
    events at the node-pair level with lambda * 1/block pair size, is equivalent to generating at the block
    pair level with lambda, then thinning.

    :param train_tuple, test_tuple, combined_tuple: A tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param verbose: Prints details of the fit along the way.
    :return: (list) test log-likelihood per event for all `k_values_to_test`.
    """
    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, _ = test_tuple
    combined_event_dict, _, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    # The number of train events does not depend on K, so it is counted once for all fits
    train_n_events = np.sum(utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict))

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_lambda, train_block_count_matrix = \
            fit_poisson_baseline_model(train_event_dict, train_num_nodes, train_duration, num_classes,
                                       verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(
            train_node_membership, nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset
        combined_count_matrix = event_dict_to_block_pair_event_counts(
            combined_event_dict, combined_node_membership, num_classes)

        combined_log_likelihood = calc_full_log_likelihood(
            combined_count_matrix, combined_node_membership, combined_duration, train_bp_lambda, num_classes)

        # Calculate log-likelihood given the train dataset. The train events span the train duration;
        # the test duration that was passed here before does not belong to the train data.
        train_log_likelihood = calc_full_log_likelihood(
            train_block_count_matrix, train_node_membership, train_duration, train_bp_lambda, num_classes)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(
            combined_log_likelihood, train_log_likelihood, test_event_dict, test_num_nodes)

        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

    total_toc = time.time()
    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
largest_connected_component_only=True, train_percentage=0.8) toc = time.time() print(f"Loaded the dataset in {toc - tic:.1f}s") train_num_events = utils.num_events_in_event_dict(train_event_dict) test_num_events = utils.num_events_in_event_dict(test_event_dict) # if verbose: print("Train: ", "Num Nodes:", train_num_nodes, "Duration:", train_duration, "Num Edges:", train_num_events) print("Test: ", "Num Nodes:", test_num_nodes, "Duration:", test_duration, "Num Edges:", test_num_events) # fit Facebook Wall-posts if fit_chip: tic = time.time() train_agg_adj = utils.event_dict_to_aggregated_adjacency( train_num_nodes, train_event_dict) if not use_agg_adj: train_adj = utils.event_dict_to_adjacency(train_num_nodes, train_event_dict) toc = time.time() if verbose: print(f"Generated aggregated adj in {toc - tic:.1f}s") tic_tot = time.time() tic = time.time() # Running spectral clustering if use_agg_adj: train_node_membership = spectral_cluster(train_agg_adj, num_classes=num_classes,
def chip_local_search_single_core(event_dict, n_classes, node_membership_init, duration, max_iter=100,
                                  verbose=True):
    """
    This function is only here for speed comparisons against the multi-core version.
    All parameters are the same as `chip_local_search`.

    In every iteration, each node is tentatively moved to every other class. A neighboring solution is scored
    by re-estimating mu and the alpha to beta ratio while keeping beta of the current solution. The best
    improving move is applied, and all parameters are then re-estimated.

    :param event_dict: Edge dictionary of events between all node pair.
    :param n_classes: (int) total number of classes/blocks
    :param node_membership_init: initial membership of every node to one of K classes. It is copied, so the
                                 caller's object is left unchanged.
    :param duration: (int) Duration of the network
    :param max_iter: (int) maximum number of iterations to be performed by local search.
    :param verbose: If True, prints more information on local search.
    :return: local optimum node_membership (a NumPy array)
    """
    n_nodes = len(node_membership_init)
    # Work on a copy: the search rewrites entries while scoring neighbors, which must not leak to the caller.
    node_membership = np.array(node_membership_init)
    # `np.int` was a plain alias of the builtin `int` and no longer exists in recent NumPy releases.
    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict, dtype=int)

    # estimate initial params of CHIP and its log-likelihood
    (mu, alpha, beta, alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, duration, n_classes)

    block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, n_classes)
    init_log_lik = fit_utils.calc_full_log_likelihood(
        block_pair_events, node_membership, mu, alpha, beta, duration, n_classes,
        add_com_assig_log_prob=False)

    log_lik = init_log_lik

    for iteration in range(max_iter):
        if verbose:
            print(f"Iteration {iteration}...", end='\r')

        # best neighbor will hold the best node_membership update in the form of
        # (node_index, updated_class_membership)
        best_neigh = None

        # for each of the (k-1)*n neighboring solutions
        for n_i in range(n_nodes):
            n_i_class = node_membership[n_i]

            for c_i in range(n_classes):
                if c_i == n_i_class:
                    continue

                # update node_membership temporarily
                node_membership[n_i] = c_i

                # Eval the approx log_lik of this neighbor, by est its mu and alpha/beta and using
                # previous beta.
                neigh_mu, neigh_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
                    agg_adj, node_membership, duration, default_mu=1e-10 / duration)
                neigh_alpha = neigh_alpha_beta_ratio * beta

                block_pair_events = utils.event_dict_to_block_pair_events(
                    event_dict, node_membership, n_classes)
                neigh_log_lik = fit_utils.calc_full_log_likelihood(
                    block_pair_events, node_membership, neigh_mu, neigh_alpha, beta, duration, n_classes,
                    add_com_assig_log_prob=False)

                # if log_lik of this neighbor is better than the "so far" best neighbor, use this
                # neighbor as the best.
                if log_lik < neigh_log_lik:
                    log_lik = neigh_log_lik
                    best_neigh = (n_i, c_i)

            # undo the temporary move before looking at the next node
            node_membership[n_i] = n_i_class

        # if no neighbor seems to increase log_lik, break. You're at a local optimum.
        if best_neigh is None:
            if verbose:
                print(f"Local solution found with {iteration} iterations.")
            break

        # if a good neighbor was found, update all CHIP params, and go for the next iteration.
        node_membership[best_neigh[0]] = best_neigh[1]
        (mu, alpha, beta, alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
            event_dict, node_membership, duration, n_classes)

        block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, n_classes)
        log_lik = fit_utils.calc_full_log_likelihood(
            block_pair_events, node_membership, mu, alpha, beta, duration, n_classes,
            add_com_assig_log_prob=False)

    if verbose:
        print(f"likelihood went from {init_log_lik:.4f} to {log_lik:.4f}. "
              f"{100 * np.abs((log_lik - init_log_lik) / init_log_lik):.2f}% increase.")

    return node_membership
def chip_local_search(event_dict, n_classes, node_membership_init, duration, max_iter=100, n_cores=-1,
                      return_fitted_param=False, verbose=True):
    """
    Performs local search / hill climbing to increase log-likelihood of the model by switching the community
    of a single node at a time. For every neighboring solution only mu and m are estimated, beta is fixed to
    the base solution to lower time complexity.

    :param event_dict: Edge dictionary of events between all node pair. Output of the generative models.
    :param n_classes: (int) total number of classes/blocks
    :param node_membership_init: (list) initial membership of every node to one of K classes. Usually output
                                 of the spectral clustering. It is copied, so the caller's object is left
                                 unchanged.
    :param duration: (int) Duration of the network
    :param max_iter: (int) maximum number of iterations to be performed by local search.
    :param n_cores: (int) number of cores to be used to parallelize the search. If not positive (e.g. -1),
                    use all available cores.
    :param return_fitted_param: if True, return the Hawkes parameters for the model as well.
    :param verbose: If True, prints more information on local search.
    :return: local optimum node_membership (a NumPy array) if `return_fitted_param` is false, otherwise
             a tuple of (node_membership, mu, alpha, beta).
    """
    n_nodes = len(node_membership_init)
    nodes = np.arange(n_nodes)
    # Work on a copy so that applying the best moves does not rewrite the caller's initial membership.
    node_membership = np.array(node_membership_init)
    # `np.int` was a plain alias of the builtin `int` and no longer exists in recent NumPy releases.
    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict, dtype=int)

    # estimate initial params of CHIP and its log-likelihood
    (mu, alpha, beta, alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
        event_dict, node_membership, duration, n_classes)

    block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, n_classes)
    init_log_lik = fit_utils.calc_full_log_likelihood(
        block_pair_events, node_membership, mu, alpha, beta, duration, n_classes,
        add_com_assig_log_prob=False)

    log_lik = init_log_lik
    n_cores = n_cores if n_cores > 0 else multiprocessing.cpu_count()
    # Nodes are split into `n_cores` consecutive batches; the "+ 1" makes the batches cover every node
    batch_size = n_nodes // n_cores + 1

    for iteration in range(max_iter):
        if verbose:
            print(f"Iteration {iteration}...", end='\r')

        # for each of the (k-1)*n neighboring solutions
        possible_solutions = Parallel(n_jobs=n_cores)(
            delayed(calc_node_neigh_solutions)(
                event_dict, n_classes, duration, node_membership, agg_adj, beta, log_lik,
                nodes[batch_size * ii:batch_size * (ii + 1)])
            for ii in range(n_cores))

        # One row per batch; column 0 is used as the node index, column 1 as its new class and
        # column 2 as the log-likelihood of that move.
        possible_solutions = np.array(possible_solutions)

        # if all returned log-likelihoods are np.nan, break. You're at a local optimum.
        if np.all(np.isnan(possible_solutions[:, 2])):
            if verbose:
                print(f"Local solution found with {iteration} iterations.")
            break

        max_ll_neigh_idx = np.nanargmax(possible_solutions[:, 2])

        # if a good neighbor was found, update all CHIP params, and go for the next iteration.
        node_membership[int(possible_solutions[max_ll_neigh_idx, 0])] = int(
            possible_solutions[max_ll_neigh_idx, 1])
        (mu, alpha, beta, alpha_beta_ratio) = fit_utils.estimate_bp_hawkes_params(
            event_dict, node_membership, duration, n_classes)

        block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, n_classes)
        log_lik = fit_utils.calc_full_log_likelihood(
            block_pair_events, node_membership, mu, alpha, beta, duration, n_classes,
            add_com_assig_log_prob=False)

        if iteration == max_iter - 1:
            print("Warning: Max iter reached!")

    if verbose:
        print(f"likelihood went from {init_log_lik:.4f} to {log_lik:.4f}. "
              f"{100 * np.abs((log_lik - init_log_lik) / init_log_lik):.2f}% increase.")

    if return_fitted_param:
        return node_membership, mu, alpha, beta

    return node_membership
# Mean event count per node pair over 100 generated networks
event_count_means = []
for i in range(100):
    # NOTE(review): the same `seed` is passed on every iteration; if it is not None the 100 networks may
    # all be identical - confirm this is intended.
    node_membership, event_dicts = community_generative_model(
        number_of_nodes, class_probabilities, bp_mu, bp_alpha, bp_beta, burnin, end_time, seed=seed)

    # dataset_utils.plot_event_count_hist(event_dicts, number_of_nodes, "Community Hawkes")

    # `np.int` was a plain alias of the builtin `int` and no longer exists in recent NumPy releases.
    event_agg_adj = utils.event_dict_to_aggregated_adjacency(number_of_nodes, event_dicts, dtype=int)

    # np.savetxt(f"community-hawkes-{i}.txt", event_agg_adj, delimiter=' ', fmt='%d')

    num_events = np.reshape(event_agg_adj, number_of_nodes**2)
    event_count_means.append(np.mean(num_events))

print("mean:", np.mean(event_count_means))
# two standard errors of the mean over the generated networks
print("95% Error:", 2 * np.std(event_count_means) / np.sqrt(len(event_count_means)))

# print(node_membership, event_dicts.keys())
# print(utils.event_dict_to_adjacency(number_of_nodes, event_dicts))
# print(utils.event_dict_to_aggregated_adjacency(number_of_nodes, event_dicts))
# Load the Facebook wall-posts dataset (largest connected component only) and report the loading time
tic = time.time()
fb_event_dict, fb_num_node, fb_duration = dataset_utils.load_facebook_wall(
    largest_connected_component_only=True)
toc = time.time()

print(f"Loaded the dataset in {toc - tic:.1f}s")

num_events = utils.num_events_in_event_dict(fb_event_dict)
if verbose:
    print("Num Nodes:", fb_num_node, "Duration:", fb_duration, "Num Edges:", num_events)

# fit Facebook Wall-posts
if fit_chip:
    # Build both the aggregated adjacency and the adjacency matrix, timing the two together
    tic = time.time()
    agg_adj = utils.event_dict_to_aggregated_adjacency(fb_num_node, fb_event_dict)
    adj = utils.event_dict_to_adjacency(fb_num_node, fb_event_dict)
    toc = time.time()

    if verbose:
        print(f"Generated aggregated adj in {toc - tic:.1f}s")

    # `tic_tot` marks the start of the whole fit, `tic` the start of the clustering step
    tic_tot = time.time()
    tic = time.time()
    # Running spectral clustering on the aggregated adjacency with 10 classes
    node_membership = spectral_cluster(agg_adj, num_classes=10, verbose=False, plot_eigenvalues=True)
    toc = time.time()
def fit_and_eval_community_hawkes(train_tuple, test_tuple, combined_tuple, nodes_not_in_train,
                                  k_values_to_test=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                                  local_search_max_iter=0, local_search_n_cores=-1,
                                  plot_fitted_hist=False, verbose=False):
    """
    Fits the CHIP model to train and evaluates the log-likelihood on the test, by evaluating the
    log-likelihood on the combined dataset and subtracting the likelihood of train, dividing by number of
    events in test.

    :param train_tuple, test_tuple, combined_tuple: A tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param local_search_max_iter: if >0, then the model is fitted using local search, else local search is
                                  not used.
    :param local_search_n_cores: Number of cores to be used for local search. Ignored if
                                 local_search_max_iter <= 0.
    :param plot_fitted_hist: If True, generates a CHIP model network based on the fitted parameters and plots
                             a histogram of the event count of real vs. fitted model.
    :param verbose: Prints details of the fit along the way.
    :return: (list) test log-likelihood per event for all `k_values_to_test`.
    """
    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, _ = test_tuple
    combined_event_dict, _, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    # The number of train events does not depend on K, so it is counted once for all fits
    train_n_events = np.sum(utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict))

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta, train_block_pair_events = \
            model_utils.fit_community_model(train_event_dict, train_num_nodes, train_duration, num_classes,
                                            local_search_max_iter, local_search_n_cores, verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(
            train_node_membership, nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset
        combined_block_pair_events = utils.event_dict_to_block_pair_events(
            combined_event_dict, combined_node_membership, num_classes)

        combined_log_likelihood = model_utils.calc_full_log_likelihood(
            combined_block_pair_events, combined_node_membership,
            train_bp_mu, train_bp_alpha, train_bp_beta, combined_duration, num_classes)

        # Calculate log-likelihood given the train dataset
        train_log_likelihood = model_utils.calc_full_log_likelihood(
            train_block_pair_events, train_node_membership,
            train_bp_mu, train_bp_alpha, train_bp_beta, train_duration, num_classes)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(
            combined_log_likelihood, train_log_likelihood, test_event_dict, test_num_nodes)

        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

        if plot_fitted_hist:
            # NOTE(review): `n_cores` is hard-coded to 26 here and `plot_fitted_hist` is forwarded
            # positionally - confirm both against `generate_fit_community_hawkes`.
            model_utils.generate_fit_community_hawkes(train_event_dict, train_node_membership,
                                                      train_bp_mu, train_bp_alpha, train_bp_beta,
                                                      train_duration, plot_fitted_hist, n_cores=26)

    total_toc = time.time()
    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
def fit_community_model(event_dict, num_nodes, duration, num_classes, local_search_max_iter,
                        local_search_n_cores, verbose=False):
    """
    Fits CHIP model to a network.

    Node memberships come from spectral clustering on the aggregated adjacency matrix. If local search is enabled
    (and there is more than one class), memberships and parameters are taken from `cls.chip_local_search`.
    Otherwise mu and the alpha-to-beta ratio are estimated from counts and beta is estimated per block pair.

    :param event_dict: Edge dictionary of events between all node pair.
    :param num_nodes: (int) Total number of nodes
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param local_search_max_iter: Maximum number of local search to be performed. If 0, no local search is done
    :param local_search_n_cores: Number of cores to parallelize local search. Only applicable if
                                 `local_search_max_iter` > 0
    :param verbose: If True, prints the membership percentages and the fitted mu, alpha and beta
    :return: node_membership, mu, alpha, beta, block_pair_events
    """
    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    # Running spectral clustering
    node_membership = spectral_cluster(agg_adj, num_classes, verbose=False)

    if local_search_max_iter > 0 and num_classes > 1:
        node_membership, bp_mu, bp_alpha, bp_beta = cls.chip_local_search(
            event_dict, num_classes, node_membership, duration,
            max_iter=local_search_max_iter, n_cores=local_search_n_cores,
            return_fitted_param=True, verbose=False)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, num_classes)
    else:
        # NOTE(review): the last argument (1e-10 / duration) looks like a fallback value for mu -
        # confirm against estimate_utils.estimate_hawkes_from_counts.
        bp_mu, bp_alpha_beta_ratio = estimate_utils.estimate_hawkes_from_counts(
            agg_adj, node_membership, duration, 1e-10 / duration)

        # The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
        bp_beta = np.zeros((num_classes, num_classes), dtype=float)

        block_pair_events = utils.event_dict_to_block_pair_events(
            event_dict, node_membership, num_classes)

        # Count the nodes of every block once, instead of re-scanning the membership for every block pair.
        block_sizes = [int(np.count_nonzero(node_membership == b)) for b in range(num_classes)]

        for b_i in range(num_classes):
            for b_j in range(num_classes):
                bp_size = block_sizes[b_i] * block_sizes[b_j]
                if b_i == b_j:
                    # Diagonal block pairs: n * n - n, i.e. pairs of a node with itself are not counted.
                    bp_size -= block_sizes[b_i]

                bp_beta[b_i, b_j], _ = estimate_utils.estimate_beta_from_events(
                    block_pair_events[b_i][b_j], bp_mu[b_i, b_j], bp_alpha_beta_ratio[b_i, b_j],
                    duration, bp_size)

        bp_alpha = bp_alpha_beta_ratio * bp_beta

    # Printing information about the fit
    if verbose:
        _, block_count = np.unique(node_membership, return_counts=True)
        class_prob = block_count / sum(block_count)
        print("Membership percentage: ", class_prob)

        print("Mu:")
        print(bp_mu)

        print("\nAlpha:")
        print(bp_alpha)

        print("\nBeta:")
        print(bp_beta)

    return node_membership, bp_mu, bp_alpha, bp_beta, block_pair_events
if node_pair not in event_dicts: event_dicts[node_pair] = [] event_dicts[node_pair].append(event_times[e]) return node_membership, event_dicts # Example of generating from the Block Hawkes model if __name__ == "__main__": seed = 1 number_of_nodes = 8 class_probabilities = [0.2, 0.4, 0.1, 0.2, 0.1] num_of_classes = len(class_probabilities) end_time = 10 bp_mu, bp_alpha, bp_beta = utils.generate_random_hawkes_params(num_of_classes, mu_range=(0.1, 0.3), alpha_range=(0.2, 0.4), beta_range=(0.5, 1), seed=seed) node_membership, event_dicts = block_generative_model(number_of_nodes, class_probabilities, bp_mu, bp_alpha, bp_beta, end_time, seed=seed) print(node_membership, event_dicts.keys()) print(utils.event_dict_to_adjacency(number_of_nodes, event_dicts)) print(utils.event_dict_to_aggregated_adjacency(number_of_nodes, event_dicts))
def fit_and_eval_block_hawkes(train_tuple, test_tuple, combined_tuple, nodes_not_in_train,
                              k_values_to_test=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                              local_search_max_iter=0, local_search_n_cores=-1,
                              plot_fitted_hist=False, verbose=False):
    """
    Fits the Block Hawkes model (BHM) to train and evaluates the log-likelihood on the test, by evaluating the
    log-likelihood on the combined dataset and subtracting the likelihood of train, dividing by number of events in
    test.

    For every K the fitted parameters are also pickled to
    `<script path>/storage/results/fb_bhm_fit/k<K>-model-params.pckl`; the directory is created if it is missing.

    :param train_tuple, test_tuple, combined_tuple: A tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param local_search_max_iter: if >0, then the model is fitted using local search, else local search is not used.
    :param local_search_n_cores: Number of cores to be used for local search. Ignored if local_search_max_iter <= 0.
    :param plot_fitted_hist: If True, calls `estimate_utils.generate_fit_block_hawkes` with the fitted parameters
                             (intended to plot a histogram of the event count of real vs. fitted model).
    :param verbose: Prints details of the fit along the way.
    :return: (list) test log-likelihood per event for all `k_values_to_test`.
    """
    # Function-scope import: only needed to make sure the results directory exists before writing to it.
    import os

    # Only the fields that are used are named; unpacking still enforces the 3-tuple shape of each argument.
    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, _ = test_tuple
    combined_event_dict, _, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta, train_block_pair_events = \
            estimate_utils.fit_block_model(train_event_dict, train_num_nodes, train_duration, num_classes,
                                           local_search_max_iter, local_search_n_cores, verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(
            train_node_membership, nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset
        combined_block_pair_events = estimate_utils.event_dict_to_combined_block_pair_events(
            combined_event_dict, combined_node_membership, num_classes)

        combined_log_likelihood = estimate_utils.calc_full_log_likelihood(
            combined_block_pair_events, combined_node_membership,
            train_bp_mu, train_bp_alpha, train_bp_beta,
            combined_duration, num_classes, add_com_assig_log_prob=True)

        # Calculate log-likelihood given the train dataset
        train_log_likelihood = estimate_utils.calc_full_log_likelihood(
            train_block_pair_events, train_node_membership,
            train_bp_mu, train_bp_alpha, train_bp_beta,
            train_duration, num_classes, add_com_assig_log_prob=True)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(
            combined_log_likelihood, train_log_likelihood, test_event_dict, test_num_nodes)

        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        train_n_events = np.sum(utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict))
        print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

        # Save results
        result_file_path = f'{dataset_utils.get_script_path()}/storage/results/fb_bhm_fit'
        # Without this, a missing results directory raises FileNotFoundError after the (expensive) fit is done.
        os.makedirs(result_file_path, exist_ok=True)
        with open(f'{result_file_path}/k{num_classes}-model-params.pckl', 'wb') as handle:
            pickle.dump([train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta,
                         train_block_pair_events], handle, protocol=pickle.HIGHEST_PROTOCOL)

        if plot_fitted_hist:
            estimate_utils.generate_fit_block_hawkes(train_event_dict, train_node_membership,
                                                     train_bp_mu, train_bp_alpha, train_bp_beta,
                                                     train_duration)

    total_toc = time.time()

    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
def fit_community_model(event_dict, num_nodes, duration, num_classes, local_search_max_iter,
                        local_search_n_cores, verbose=False):
    """
    Fits CHIP model to a network.

    Memberships are initialised with spectral clustering on the aggregated adjacency matrix. They are then either
    refined together with the parameters by `cls.chip_local_search`, or kept as-is while the parameters are
    obtained from `estimate_bp_hawkes_params`.

    :param event_dict: Edge dictionary of events between all node pair.
    :param num_nodes: (int) Total number of nodes
    :param duration: (int) duration of the network
    :param num_classes: (int) number of blocks / classes
    :param local_search_max_iter: Maximum number of local search to be performed. If 0, no local search is done
    :param local_search_n_cores: Number of cores to parallelize local search. Only applicable if
                                 `local_search_max_iter` > 0
    :param verbose: If True, prints the membership percentages and the fitted mu, alpha and beta
    :return: node_membership, mu, alpha, beta, block_pair_events
    """
    aggregated = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)

    # Initial community assignment via spectral clustering
    node_membership = spectral_cluster(aggregated, num_classes, verbose=False, plot_eigenvalues=False)

    skip_local_search = local_search_max_iter <= 0 or num_classes <= 1
    if skip_local_search:
        # Keep the spectral memberships and estimate all block pair parameters directly
        fitted = estimate_bp_hawkes_params(event_dict, node_membership, duration, num_classes,
                                           agg_adj=aggregated, return_block_pair_events=True)
        mu, alpha, beta, _, block_pair_events = fitted
    else:
        # Local search returns refined memberships along with the fitted parameters
        node_membership, mu, alpha, beta = cls.chip_local_search(event_dict, num_classes, node_membership,
                                                                 duration,
                                                                 max_iter=local_search_max_iter,
                                                                 n_cores=local_search_n_cores,
                                                                 return_fitted_param=True,
                                                                 verbose=False)
        block_pair_events = utils.event_dict_to_block_pair_events(event_dict, node_membership, num_classes)

    # Report the fit
    if verbose:
        block_count = np.unique(node_membership, return_counts=True)[1]
        print(f"Membership percentage: ", block_count / sum(block_count))

        for header, values in (("Mu:", mu), ("\nAlpha:", alpha), ("\nBeta:", beta)):
            print(header)
            print(values)

    return node_membership, mu, alpha, beta, block_pair_events