def instance2features(instance):
    """Build a flat feature vector describing a clustering instance.

    Feature order (all gated by ``settings``):
      1. optionally the dimensionality and the number of points,
      2. per distance metric: mean and deviation of the pairwise distances,
         followed by any configured modality-test statistics
         ('dip': dip statistic and/or p-value; 'silverman': p-value of the
         Silverman bandwidth test on a 250-point subsample),
      3. any additional statistics (currently only 'hopkins').

    Returns a plain list of numeric features.
    """
    features = []
    if settings.use_N_DIM:
        features.append(instance.dim)
        features.append(len(instance.points))
    for metric in settings.distances:
        # dist_util.distance yields the raw pairwise distances plus summary stats.
        distances, mean, dev = dist_util.distance(instance, metric)
        features.append(mean)
        features.append(dev)
        for test in settings.modality_tests:
            wanted = settings.modality_tests[test]
            if test == 'dip':
                stat, pval = diptest(distances)[:2]
                if 'dip_stat' in wanted:
                    features.append(stat)
                if 'p_value' in wanted:
                    features.append(pval)
            if test == 'silverman':
                if 'p_value' in wanted:
                    # Subsample to 250 points to keep the bandwidth test cheap.
                    sample = np.random.choice(distances, 250)
                    out = modality.silverman_bwtest(sample, alpha=0.05)
                    assert isinstance(out, float)
                    features.append(out)
    for feat in settings.additional_statistics:
        if feat == 'hopkins':
            features.append(hopkins(instance.points))
    return features
def dip(peak, smooth=True):
    """Run a dip test on a peak signal.

    The diptest can be used to test if a distribution is unimodal. In order
    to get it to work, the peak signal is turned into a distribution by
    simulating draws from it, and the test is run on the simulated data.
    This is a little hackish; there is probably a better/faster way.

    Parameters
    ----------
    peak : array-like
        The peak signal to test.
    smooth : bool
        When True, smooth the signal with a 10-tap Hamming window first.

    Returns
    -------
    (test, pval) : the dip statistic and its p-value.
    """
    # Optionally smooth the signal with a Hamming window; keep the flag and
    # the smoothed array in separate names.
    signal_arr = signal.convolve(peak, signal.hamming(10)) if smooth else peak
    # Support points of the simulated distribution.
    x_grid = np.arange(0, signal_arr.shape[0])
    # Normalize so the signal can serve as a probability vector.
    probs = signal_arr / signal_arr.sum()
    # Draw 3000 samples from the peak-shaped distribution.
    sim = choice(x_grid, size=3000, replace=True, p=probs)
    test, pval = diptest(sim)
    return test, pval
def generate_scatter_dist_plot(articles, num_ideas, plot_dir, prefix,
                               cooccur_func=None, make_plots=True,
                               write_tests=True, group_by="year",
                               samples=1000):
    """Compute PMI vs. prevalence-correlation for idea pairs and plot them.

    Builds the cooccurrence PMI matrix over all articles and the time-series
    correlation matrix over articles grouped by ``group_by``, collects the
    finite upper-triangle pairs, optionally writes statistical tests
    (normality, unimodality, Pearson correlation) to a jsonlist file, and
    optionally saves a joint plot.

    Returns (pmi, ts_correlation, filename) where filename is the plot path
    (written only when make_plots is True).
    """
    result = get_count_cooccur(articles, func=cooccur_func)
    pmi = get_pmi(result["cooccur"], result["count"],
                  float(result["articles"]), num_ideas=num_ideas)
    articles_group = get_time_grouped_articles(articles, group_by=group_by)
    info_dict = {
        k: get_count_cooccur(articles_group[k], func=cooccur_func)
        for k in articles_group
    }
    ts_correlation = get_ts_correlation(info_dict, num_ideas, normalize=True)
    # Collect finite (correlation, pmi) pairs from the strict upper triangle.
    xs, ys = [], []
    for i in range(num_ideas):
        for j in range(i + 1, num_ideas):
            if np.isnan(pmi[i, j]) or np.isnan(ts_correlation[i, j]):
                continue
            if np.isinf(pmi[i, j]) or np.isinf(ts_correlation[i, j]):
                continue
            xs.append(ts_correlation[i, j])
            ys.append(pmi[i, j])
    if write_tests:
        with open("%s/%s_test.jsonlist" % (plot_dir, prefix), "w") as fout:
            k, p = ss.mstats.normaltest(xs)
            fout.write("%s\n" % json.dumps({
                "name": "correlation normality test",
                "k2": None if np.ma.is_masked(k) else k,
                "p-value": p
            }))
            k, p = ss.mstats.normaltest(ys)
            fout.write("%s\n" % json.dumps({
                "name": "PMI normality test",
                "k2": None if np.ma.is_masked(k) else k,
                "p-value": p
            }))
            # NOTE(review): ``unimodality_test`` is not a parameter; it must
            # be a module-level flag — confirm it is defined where this runs.
            if unimodality_test:
                d, p = diptest.diptest(np.array(xs))
                # BUGFIX: guard the dip statistic ``d`` itself, not the stale
                # ``k`` left over from the normality tests above.
                fout.write("%s\n" % json.dumps({
                    "name": "correlation unimodality test",
                    "d": None if np.ma.is_masked(d) else d,
                    "p-value": p
                }))
                d, p = diptest.diptest(np.array(ys))
                fout.write("%s\n" % json.dumps({
                    "name": "PMI unimodality test",
                    "d": None if np.ma.is_masked(d) else d,
                    "p-value": p
                }))
            c, p = ss.pearsonr(xs, ys)
            fout.write("%s\n" % json.dumps({
                "name": "correlation between correlation and PMI",
                "coef": c,
                "p-value": p
            }))
    filename = "%s/%s_joint_plot.pdf" % (plot_dir, prefix)
    if make_plots:
        fig = pf.joint_plot(np.array(xs), np.array(ys),
                            xlabel="prevalence correlation",
                            ylabel="cooccurrence", xlim=(-1, 1))
        pf.savefig(fig, filename)
    return pmi, ts_correlation, filename
from src.gen import break_gen
from src.dist_util import distance
from diptest.diptest import diptest
import sys
import matplotlib.pyplot as plt

# Experiment script: repeatedly generate instances and hunt for cases where
# the dip-test p-value under the alternative metric is much smaller than
# under the Euclidean metric. Whenever a new record gap is found, show the
# point cloud and the two distance histograms.
best = 0
other = 'minkowski'
for trial in range(100000):
    inst = break_gen()
    distances_euclid, mean_euclid, dev_euclid = distance(inst, 'eucld')
    dip_euclid = diptest(distances_euclid)
    distances_other, mean_other, dev_other = distance(inst, other)
    dip_other = diptest(distances_other)
    # Negative gap => the alternative metric's p-value is lower than Euclid's.
    gap = dip_other[1] - dip_euclid[1]
    if gap < best:
        print(dip_euclid, dip_other)
        title = ("p values --- Euclid = " + str(round(dip_euclid[1], 2)) +
                 ", " + other + " = " + str(round(dip_other[1], 2)))
        plt.title(title)
        plt.scatter(inst.points[:, 0], inst.points[:, 1])
        plt.show()
        plt.hist([distances_euclid, distances_other],
                 label=['euclid', other], bins=30)
        plt.legend()
        plt.title(title)
        plt.show()
        best = min(best, gap)
def ackerman_dist(instance):
    """Dip-test the instance's pairwise distances for unimodality.

    Returns a tuple ``(below_cutoff, p_value)`` where ``below_cutoff`` is
    True when the dip-test p-value is under the module-level
    ``ackerman_cutoff``.
    """
    distances = distance(instance)
    # Cap the sample size so the dip test stays fast on huge instances.
    # NOTE(review): np.random.choice samples WITH replacement by default —
    # confirm a bootstrap sample (rather than replace=False) is intended.
    if len(distances) > 70000:
        distances = np.random.choice(distances, 70000)
    result = diptest(distances)
    pval = result[1]
    return pval < ackerman_cutoff, pval