def mi(x, y, bins=10):
    """Return the mutual information of `x` and `y`, scaled by their entropies.

    Both samples are discretized with `bins` histogram bins. The mutual
    information of the joint histogram is divided by the square root of the
    product of the two marginal entropies (as given by `u.compute_entropy`).

    :param x: first sample, passed to `np.histogram` / `np.histogram2d`
    :param y: second sample, same length as `x`
    :param bins: bin specification forwarded to the numpy histogram functions
    :return: mutual information divided by sqrt(H_x * H_y)
    """
    marginal_x = np.histogram(x, bins)[0]
    entropy_x = u.compute_entropy(marginal_x)
    marginal_y = np.histogram(y, bins)[0]
    entropy_y = u.compute_entropy(marginal_y)

    # The joint histogram serves as the contingency table.
    joint_counts = np.histogram2d(x, y, bins)[0]
    raw_mi = skm.mutual_info_score(None, None, contingency=joint_counts)

    normalizer = np.sqrt(entropy_x * entropy_y)
    return raw_mi / normalizer
def get_branch_lengths_estimates(tree):
    """Summarize the branch lengths of a tree.

    :param tree: Tree node or tree file or newick tree string;
    :return: tuple of (max, min, mean, std, entropy) over the branch lengths
        returned by `get_branch_lengths`
    """
    lengths = get_branch_lengths(tree)
    lengths_entropy = compute_entropy(lengths)

    longest = max(lengths)
    shortest = min(lengths)
    average = np.mean(lengths)
    spread = np.std(lengths)
    return longest, shortest, average, spread, lengths_entropy
def get_diameters_estimates(tree_filepath, actual_bl=True):
    """Summarize the pairwise leaf-to-leaf distances of a tree.

    When `actual_bl` is False every descendant's `dist` is overwritten with
    1.0, i.e. the loaded tree is modified in place; in that mode only a
    string (file path or newick string) is accepted.

    :param tree_filepath: tree file or newick tree string;
    :param actual_bl: True to sum actual dists, False for num of branches
    :return: tuple of (max, min, mean, std, entropy) over the distances
        between every pair of leaves
    """
    count_branches_only = not actual_bl

    # NOTE: the tree is intentionally not deep-copied -- deepcopy exceeds the
    # recursion depth on large trees.
    if count_branches_only:
        assert isinstance(tree_filepath, str)

    root = get_newick_tree(tree_filepath).get_tree_root()

    if count_branches_only:
        # Unit branch lengths turn path length into a branch count.
        for descendant in root.iter_descendants():
            descendant.dist = 1.0

    leaf_nodes = list(root.iter_leaves())
    pair_distances = [
        first.get_distance(second)
        for first, second in itertools.combinations(leaf_nodes, 2)
    ]

    distances_entropy = compute_entropy(pair_distances)
    return (
        max(pair_distances),
        min(pair_distances),
        np.mean(pair_distances),
        np.std(pair_distances),
        distances_entropy,
    )
def time_entropy(visits):
    """Return the entropy of a venue's check-in hours, divided by log(24).

    :param visits: iterable of objects exposing an integer `hour` attribute
    :return: `u.compute_entropy` of the 24-slot hourly histogram divided by
        `np.log(24.0)`
    """
    checkin_hours = [visit.hour for visit in visits]
    # minlength=24 guarantees one slot per hour even for unseen hours.
    hourly_counts = np.bincount(checkin_hours, minlength=24).astype(float)
    raw_entropy = u.compute_entropy(hourly_counts)
    return raw_entropy/np.log(24.0)
def venue_entropy(visitors):
    """Compute the entropy of venue given the list of its `visitors`.

    :param visitors: iterable of hashable visitor identifiers, one entry per
        visit (repeated entries count as repeated visits)
    :return: result of `u.compute_entropy` applied to the per-visitor visit
        counts, given as a float array
    """
    # pylint: disable=E1101
    visit_counts = Counter(visitors).values()
    # On Python 3 `dict.values()` is a view, which `np.array(..., dtype=float)`
    # cannot convert; materializing it as a list works on Python 2 and 3.
    counts_array = np.array(list(visit_counts), dtype=float)
    return u.compute_entropy(counts_array)
def jensen_shannon_divergence(P, Q):
    """Compute JSD(P || Q) as defined in
    https://en.wikipedia.org/wiki/Jensen–Shannon_divergence

    :param P: first distribution (must support `P + Q` and scalar scaling)
    :param Q: second distribution, same shape as `P`
    :return: entropy of the midpoint distribution minus the mean of the two
        individual entropies, all computed with `u.compute_entropy`
    """
    midpoint = 0.5*(P + Q)
    entropy_p = u.compute_entropy(P)
    entropy_q = u.compute_entropy(Q)
    mean_entropy = 0.5*(entropy_p + entropy_q)
    midpoint_entropy = u.compute_entropy(midpoint)
    return midpoint_entropy - mean_entropy
def jensen_shannon_divergence(P, Q):
    """Compute JSD(P || Q) as defined in
    https://en.wikipedia.org/wiki/Jensen–Shannon_divergence

    :param P: first distribution (must support `P + Q` and scalar scaling)
    :param Q: second distribution, same shape as `P`
    :return: H(0.5 * (P + Q)) - 0.5 * (H(P) + H(Q)), where H is
        `u.compute_entropy`
    """
    mixture = 0.5 * (P + Q)
    h_p = u.compute_entropy(P)
    h_q = u.compute_entropy(Q)
    half_entropy_sum = 0.5 * (h_p + h_q)
    h_mixture = u.compute_entropy(mixture)
    return h_mixture - half_entropy_sum
def make_bias_noise_figure(bias_cp_dfs,
                           entropy_cp_dfs,
                           noisy_cp_dfs,
                           noise_levels,
                           noiseless_cp_df,
                           analytical_cp_df,
                           bias_cp_df_labels=None,
                           entropy_cp_df_labels=None,
                           base_figure_scale=3,
                           include_binomial_null=False):
    """Assemble the bias / entropy / noise-comparison figure.

    Layout, on a fine-grained grid:
      * top row: a wide bias panel on the left and an entropy panel on the
        right;
      * below, one column per entry of `noisy_cp_dfs`, each holding a noise
        comparison panel with a short histogram panel underneath.

    :param bias_cp_dfs: objects supporting
        `.groupby("index_set").count().final_loss` (presumably pandas
        DataFrames), forwarded to `panels.make_bias_panel`
    :param entropy_cp_dfs: inputs to `utils.compute_entropy`,
        `utils.bootstrap_entropy_sd` and `panels.make_entropy_panel`
    :param noisy_cp_dfs: one entry per noise column
    :param noise_levels: noise level paired (via `zip`) with each entry of
        `noisy_cp_dfs`
    :param noiseless_cp_df: forwarded to `panels.make_noise_comparison_panel`
    :param analytical_cp_df: forwarded to the noise and histogram panels
    :param bias_cp_df_labels: labels forwarded to the bias panel
    :param entropy_cp_df_labels: labels forwarded to the entropy panel
    :param base_figure_scale: multiplier used to derive the figure size
    :param include_binomial_null: forwarded to `panels.make_bias_panel`
    :return: (figure, axes grid from `plt.subplots`, bias axis, list of noise
        comparison axes)
    """
    n_noise_columns = len(noisy_cp_dfs)

    # Sizes below are in grid cells.
    panel_cells = 5
    histogram_cells = 2
    gap_cells = 1
    column_stride = panel_cells + gap_cells

    n_rows = panel_cells * 2 + histogram_cells + gap_cells
    n_cols = n_noise_columns * panel_cells + (n_noise_columns - 1) * gap_cells
    grid_shape = [n_rows, n_cols]
    fig_dimensions = (n_noise_columns * base_figure_scale,
                      2 * base_figure_scale)

    f, ax_grid = plt.subplots(n_rows, n_cols, figsize=fig_dimensions)

    # --- top-left: bias panel, spanning everything but the entropy panel ---
    bias_ax = plt.subplot2grid(grid_shape, (0, 0),
                               colspan=n_cols - panel_cells - gap_cells,
                               rowspan=panel_cells)
    counts = []
    for bias_df in bias_cp_dfs:
        counts.append(bias_df.groupby("index_set").count().final_loss)
    marginal_counts = []
    for per_set_count in counts:
        marginal_counts.append(utils.compute_marginal_counts(per_set_count))
    panels.make_bias_panel(bias_cp_dfs,
                           counts,
                           marginal_counts,
                           labels=bias_cp_df_labels,
                           ax=bias_ax,
                           include_binomial_null=include_binomial_null)

    # --- top-right: entropy panel ---
    entropy_ax = plt.subplot2grid(grid_shape, (0, n_cols - panel_cells),
                                  rowspan=panel_cells,
                                  colspan=panel_cells)
    entropies = []
    for entropy_df in entropy_cp_dfs:
        entropies.append(utils.compute_entropy(entropy_df))
    entropy_sds = []
    for entropy_df in entropy_cp_dfs:
        entropy_sds.append(utils.bootstrap_entropy_sd(entropy_df))
    panels.make_entropy_panel(entropy_cp_dfs,
                              entropies,
                              entropy_sds,
                              entropy_cp_df_labels,
                              ax=entropy_ax)

    # --- bottom: one noise panel + histogram per noisy data set ---
    noise_row = panel_cells + gap_cells
    histogram_row = panel_cells * 2 + gap_cells
    noise_axs = []
    for ii, (noisy_cp_df, noise_level) in enumerate(
            zip(noisy_cp_dfs, noise_levels)):
        left_col = column_stride * ii

        # Every column after the first shares its axes with the first noise
        # panel; `noise_axs` is only extended at the end of each iteration.
        has_reference_ax = bool(noise_axs)
        if has_reference_ax:
            noise_share = {"sharex": noise_axs[0], "sharey": noise_axs[0]}
        else:
            noise_share = {}

        noise_ax = plt.subplot2grid(grid_shape, (noise_row, left_col),
                                    rowspan=panel_cells,
                                    colspan=panel_cells,
                                    **noise_share)

        include_y_label = (ii == 0)
        include_legend = (ii + 1 == n_noise_columns)
        panels.make_noise_comparison_panel(noisy_cp_df,
                                           analytical_cp_df,
                                           noiseless_cp_df,
                                           ax=noise_ax,
                                           include_legend=include_legend,
                                           include_x_label=False,
                                           include_y_label=include_y_label,
                                           noise_level=noise_level)

        # Histograms only share the x axis with the first noise panel.
        if has_reference_ax:
            histogram_share = {"sharex": noise_axs[0]}
        else:
            histogram_share = {}

        histogram_ax = plt.subplot2grid(grid_shape, (histogram_row, left_col),
                                        rowspan=histogram_cells,
                                        colspan=panel_cells,
                                        **histogram_share)
        panels.make_histogram_comparison_panel(
            [noisy_cp_df, analytical_cp_df],
            [NUMERICAL_CPS_COLOR, ANALYTICAL_CPS_COLOR],
            ax=histogram_ax,
            include_x_label=True,
            include_y_label=include_y_label)

        noise_axs.append(noise_ax)

    return f, ax_grid, bias_ax, noise_axs