def test_save_and_load():
    """Round-trip a Zipf through ``save`` and ``Zipf.load`` and compare.

    The scratch file is removed in a ``finally`` clause so that a failing
    ``save`` or ``load`` does not leave it behind in the working directory.
    """
    path = "save_and_load_test.json"
    z1 = Zipf({"one": 0.2, "two": 0.25, "three": 0.6})
    try:
        z1.save(path)
        z2 = Zipf.load(path)
    finally:
        # ``save`` may have failed before creating the file.
        if os.path.exists(path):
            os.remove(path)
    assert z1 == z2
def _plot_pareto_pdf(ax, alpha, xmin, degrees, max_degree, hist=False):
    """Plot a ``Zipf(alpha, max_degree, xmin)`` curve on ``ax``.

    The curve is evaluated for every integer in ``range(xmin, max_degree)``
    and rescaled by ``n / total``, where ``n`` is the number of entries of
    ``degrees`` that are at least ``xmin`` and ``total`` is the sum of the
    pdf values over that range.

    With ``hist`` false the pdf values are drawn as a line. With ``hist``
    true the cdf values are drawn instead, using ``"v"`` markers shifted
    left by 0.5.
    """
    dist = Zipf(alpha, max_degree, xmin)

    # Number of observed degrees at or above the cut-off.
    sample_size = sum(1 for degree in degrees if degree >= xmin)

    support = range(xmin, max_degree, 1)
    values = []
    mass = 0
    for k in support:
        density = dist.pdf(k)
        values.append(dist.cdf(k) if hist else density)
        mass += density

    scaled = [value * sample_size / mass for value in values]

    if hist:
        ax.plot([k - 0.5 for k in support], scaled, "v")
    else:
        ax.plot(support, scaled)
    return
def test_sort():
    """``sort`` must compare equal to a Zipf listed from highest to lowest value."""
    unsorted = Zipf({"one": 0.2, "two": 0.25, "three": 0.6})
    expected = Zipf({"three": 0.6, "two": 0.25, "one": 0.2})
    assert expected == unsorted.sort()
def test_mean():
    """``mean`` of the three sample values must equal 0.35."""
    values = {"one": 0.2, "two": 0.25, "three": 0.6}
    result = Zipf(values).mean()
    assert result == 0.35
def test_neg():
    """Check unary minus: double negation and single negation, both sorted."""
    z = Zipf({"one": 1, "two": 0.5})
    expected = (z, Zipf({"two": -0.5, "one": -1}))
    actual = ((-(-z)).sort(), (-z).sort())
    assert expected == actual
def test_mul():
    """Check ``*`` with a number and with other Zipf instances."""
    z = Zipf({"one": 0.75, "two": 0.5})
    all_ones = Zipf({"one": 1.0, "two": 1.0})
    single_one = Zipf({"one": 1.0})

    expected = (
        Zipf({"one": 1.5, "two": 1.0}),
        z,
        Zipf({"one": 0.75}),
    )
    actual = (
        z * 2,
        (z * all_ones * all_ones * all_ones).sort(),
        (z * single_one * single_one * single_one).sort(),
    )
    assert expected == actual
def test_str():
    """``str`` must give the expected text and agree with ``repr``."""
    z = Zipf({"g": 1})
    rendered = (str(z), str(Zipf()), str(z), str(Zipf()))
    wanted = ('{\n "g": 1\n}', '{}', repr(z), repr(Zipf()))
    assert rendered == wanted
def test_ksequence_factory():
    """Check ``ZipfFromKSequence`` argument validation and a reference run.

    First verifies that constructing the factory with a negative ``k``
    raises. Then runs a factory with ``k = 5`` on the text stored in
    ``factory_utils/sequence/sequence.txt`` and compares the rounded result
    with ``factory_utils/expected_results/sequence.json`` (sorted, rounded).
    All problems are collected and reported together by the final assert.
    """
    errors = []
    try:
        ZipfFromKSequence(-3)
    except Exception:
        # Any exception counts as a rejection; the exact type is not pinned.
        pass
    else:
        errors.append("ZipfFromKSequence should fail with k less than zero")

    k = 5
    factory = ZipfFromKSequence(k)

    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain "+" concatenation would yield the root-anchored "/factory_utils".
    base_dir = os.path.join(os.path.dirname(__file__), "factory_utils")
    expected_path = os.path.join(base_dir, "expected_results", "sequence.json")
    sequence_path = os.path.join(base_dir, "sequence", "sequence.txt")

    zipf = Zipf.load(expected_path).sort().round()
    with open(sequence_path, "r") as f:
        sequence = f.read()

    factory_run = factory.run(sequence).round()
    if factory_run != zipf:
        errors.append(
            "Sequence zipf run is different than expected: %s != %s"
            % (zipf, factory_run))

    assert not errors, "errors occured:\n{}".format("\n".join(errors))
def test_check_empty():
    """``check_empty`` must raise ValueError on an empty Zipf only."""
    populated = Zipf({"one": 0.2, "two": 0.25, "three": 0.6})
    behaves = False

    # An empty Zipf is expected to raise.
    try:
        Zipf().check_empty()
    except ValueError:
        behaves = True

    # A populated Zipf raising here invalidates the result.
    try:
        populated.check_empty()
    except ValueError:
        behaves = False

    assert behaves
def get_data(q, path, test_size=0.3, random_state=42):
    """Load data from ``path``, process it with ``Zipf`` and split it.

    The frame returned by ``getData(path)`` is fitted, transformed and
    filtered by language through a ``Zipf(q=q)`` instance. Its ``'Text'``
    and ``'Category'`` entries are then split with ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test, dataframe)`` where the last
        item is the ``dataframe`` attribute of the Zipf instance.
    """
    frame = getData(path)

    model = Zipf(q=q)
    model.fit(frame)
    model.transform()
    model.filter_by_language()

    texts = model['Text']
    labels = model['Category']
    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test, model.dataframe
def test_truediv():
    """Check ``/``: dividing by zero raises ValueError; other cases match."""
    z = Zipf({"one": 1, "two": 0.5})

    try:
        z / 0
    except ValueError:
        pass
    else:
        # Division by zero went through without an error.
        assert False

    all_ones = Zipf({"one": 1, "two": 1})
    single_one = Zipf({"one": 1})

    expected = (
        Zipf({"one": 0.5, "two": 0.25}),
        z,
        Zipf({"one": 1}),
    )
    actual = (z / 2, (z / all_ones).sort(), (z / single_one).sort())
    assert expected == actual
def fail_empty(lamb):
    """Call ``lamb`` on an empty Zipf and report if it fails to raise.

    Returns:
        A list of error messages: empty when ``lamb`` raised ValueError,
        otherwise holding one message.
    """
    problems = []
    try:
        lamb(Zipf())
    except ValueError:
        pass
    else:
        problems.append("Set has not raised exception with empty Zipf")
    return problems
def _fail_types(lamb, tests):
    """Call ``lamb(zipf, value)`` for each value and report missing errors.

    Args:
        lamb: callable taking an empty Zipf and one value from ``tests``;
            it is expected to raise ValueError for every value.
        tests: iterable of values to feed to ``lamb``.

    Returns:
        A list with one message per value for which ``lamb`` did not raise
        ValueError.
    """
    z = Zipf()
    errors = []
    for test in tests:
        try:
            lamb(z, test)
        except ValueError:
            continue
        # The message previously referenced the undefined name ``fail_test``,
        # raising NameError exactly when a failure had to be recorded.
        # The 1-tuple keeps formatting correct when ``test`` is itself a tuple.
        errors.append(
            "Set has not raised exception with value %s" % (test,))
    return errors
def test_normalize():
    """Check ``normalize`` on a two-entry and on a one-entry Zipf."""
    two_entries = Zipf({"one": 3, "two": 1})
    one_entry = Zipf({"one": 1})
    actual = (two_entries.normalize(), one_entry.normalize())
    expected = (Zipf({"one": 0.75, "two": 0.25}), one_entry)
    assert actual == expected
def _bipartite_op_stats_flat(g, output=None):
    """Log basic statistics of graph ``g`` and plot its degree distribution.

    Logs the vertex and edge counts, the highest degree, whether the graph
    is connected (with component count and largest-component size when it
    is not) and the ``Zipf.mle`` estimate computed from the degrees.

    When ``output`` is truthy, a degree histogram and a log-log degree
    distribution are saved as ``degree-hist-<output>`` and
    ``degree-dist-loglog-<output>``. Otherwise both plots are placed side
    by side in one figure and shown with ``plt.show()``.

    Returns:
        The same ``g`` object that was passed in.
    """
    # Vertex/edge count
    info(" Number of vertices: %d" % g.vcount())
    info(" Number of edges: %d" % g.ecount())
    degrees = g.degree()
    highest = g.maxdegree()
    info(" Highest degree: %d" % highest)

    # Connectedness
    if not g.is_connected():
        info(" Graph is not connected")
        components = g.clusters()
        info(" Number of connected components = %d" % len(components))
        largest = components.giant()
        info(" Size of largest component = %d" % largest.vcount())
    else:
        info(" Graph is connected")

    # Power law exponent estimator
    alpha_hat = Zipf.mle(degrees)
    info(" Maximum Likelihood Estimator:")
    info(" alpha (xm=1): %.20f" % alpha_hat)

    # Node degrees distributions
    if not output:
        # Interactive mode: both plots share one figure.
        fig = plt.figure()
        ax = plt.subplot(1, 2, 1)
        _plot_degree_hist(ax, degrees, highest)
        ax = plt.subplot(1, 2, 2)
        _plot_degree_dist_loglog(ax, degrees, highest)
        plt.show()
        return g

    # File mode: one figure per plot, each saved under its own name.
    plots = (
        (_plot_degree_hist, "degree-hist-%s"),
        (_plot_degree_dist_loglog, "degree-dist-loglog-%s"),
    )
    for draw, pattern in plots:
        fig = plt.figure()
        ax = plt.subplot(1, 1, 1)
        draw(ax, degrees, highest)
        filename = pattern % (output)
        info(' Create "%s"' % filename)
        fig.savefig(filename)
    return g
def test_add():
    """Check ``+`` between Zipf instances with shared and distinct keys."""
    z = Zipf({"one": 0.5, "two": 0.4})
    other = Zipf({"one": 0.25, "four": 0.25})
    third = Zipf({"three": 1})

    expected = (
        Zipf({"one": 1, "two": 0.8}),
        Zipf(),
        Zipf({"one": 0.75, "two": 0.4, "four": 0.25}),
        Zipf({"three": 1, "one": 0.75, "two": 0.4, "four": 0.25}),
    )
    actual = (
        (z + z).sort(),
        (z + (-z)).sort(),
        (z + other).sort(),
        (z + other + third).sort(),
    )
    assert expected == actual
def factory_fails(Factory, path, prepare=None, run=None):
    """Run ``Factory`` over the stored scenarios and collect mismatches.

    Args:
        Factory: factory class under test; its ``__name__`` is used in
            messages.
        path: directory (relative to this file) holding the input data.
        prepare: callable invoked as ``prepare(options=...)`` to build the
            factory; defaults to ``Factory`` itself.
        run: callable invoked as ``run(factory, data)``; defaults to
            calling ``factory.run(data)``.

    The scenarios are ``"default"``, ``"empty"`` and every key of
    ``_options_for_tests``. For each one the input is read from a ``.json``
    file if present, else from a ``.txt`` file. If neither the input nor
    the expected result file exists, ``pytest.skip`` is called.

    Returns:
        The list from ``factory_break_options(Factory)`` extended with one
        message per scenario whose sorted, rounded result differs from the
        expected one.
    """
    if prepare is None:
        prepare = Factory

    def _default_run(factory, data):
        return factory.run(data)

    if run is None:
        run = _default_run

    current_path = os.path.dirname(__file__)
    errors = factory_break_options(Factory)
    scenarios = ["default", "empty"] + list(_options_for_tests.keys())

    for scenario in scenarios:
        data_name = map_test_to_data(scenario)
        json_input = os.path.join(
            current_path, "%s/%s.json" % (path, data_name))
        text_input = os.path.join(
            current_path, "%s/%s.txt" % (path, data_name))
        expected_file = os.path.join(
            current_path, "expected_results/%s.json" % scenario)

        # JSON input takes precedence over plain text.
        if os.path.isfile(json_input):
            with open(json_input, "r") as handle:
                data = json.load(handle)
        elif os.path.isfile(text_input):
            with open(text_input, "r") as handle:
                data = handle.read()
        else:
            pytest.skip(
                "While testing %s, data for testing '%s' was not found in %s or %s."
                % (Factory.__name__, scenario, text_input, json_input))

        if not os.path.isfile(expected_file):
            pytest.skip(
                "While testing %s, result for testing '%s' was not found in %s."
                % (Factory.__name__, scenario, expected_file))

        expected = Zipf.load(expected_file).sort().round()
        factory = prepare(options=_get_options_for(scenario))
        produced = run(factory, data).sort().round()

        if expected != produced:
            errors.append(
                "%s has not expected result on run test '%s': %s != %s"
                % (Factory.__name__, scenario, expected, produced))
    return errors
def bipartite_op_gen_zipf(g, params):
    """Generate a bipartite graph from two Zipf degree distributions.

    Args:
        g: not used by this function.
        params: eight values, in order: ``alpha2, N2, xmin2, n2, alpha3,
            N3, xmin3, n3``. The alphas are parsed as floats, the rest as
            ints. The first three of each group build a ``Zipf``; ``n2``
            and ``n3`` are the vertex counts of the two levels.

    If exactly one of ``n2``/``n3`` is non-positive, it is computed from the
    other so that the expected total degrees of both levels match. If both
    are non-positive, ``error`` is called. If both are positive, they are
    used as given.

    Returns:
        The result of ``bipartite_cm`` applied to the two degree sequences
        drawn by ``random_matching_degrees``.
    """
    if len(params) != 8:
        error('invalid number of arguments for "genzipf"')

    zipf_alpha2 = float(params[0])
    zipf_N2 = int(params[1])
    zipf_xmin2 = int(params[2])
    n2 = int(params[3])
    zipf_alpha3 = float(params[4])
    zipf_N3 = int(params[5])
    zipf_xmin3 = int(params[6])
    n3 = int(params[7])

    z2 = Zipf(zipf_alpha2, zipf_N2, zipf_xmin2)
    z3 = Zipf(zipf_alpha3, zipf_N3, zipf_xmin3)
    z2_exp = z2.expectation()
    z3_exp = z3.expectation()
    info(" L2 distrib expectation = %.20f" % (z2_exp))
    info(" L3 distrib expectation = %.20f" % (z3_exp))

    if (n2 <= 0) and (n3 > 0):
        n2 = int(round((n3 * z3_exp) / z2_exp))
        info(" n2 auto-computed = %d" % (n2))
    elif (n2 > 0) and (n3 <= 0):
        n3 = int(round((n2 * z2_exp) / z3_exp))
        info(" n3 auto-computed = %d" % (n3))
    elif (n2 <= 0) and (n3 <= 0):
        # Previously a bare ``else``, which also rejected the case where
        # both counts were supplied, contradicting the message below.
        error(" n2 and n3 cannot both be undefined")

    info(" a2=%f, N2=%d, xmin2=%d, n2=%d"
         % (zipf_alpha2, zipf_N2, zipf_xmin2, n2))
    info(" a3=%f, N3=%d, xmin3=%d, n3=%d"
         % (zipf_alpha3, zipf_N3, zipf_xmin3, n3))

    # Check expected number of edges at level 2
    # against expected number of edges at level 3
    l2_exp = z2_exp * n2
    l3_exp = z3_exp * n3
    info(" Expected total L2 degree = %.20f" % (l2_exp))
    info(" Expected total L3 degree = %.20f" % (l3_exp))

    (degrees2, degrees3) = random_matching_degrees(z2, n2, z3, n3)
    log(1, " MLE L2 = %f" % (Zipf.mle(degrees2, zipf_xmin2)))
    log(1, " MLE L3 = %f" % (Zipf.mle(degrees3, zipf_xmin3)))
    return bipartite_cm(degrees2, degrees3)
def test_sub():
    """Check ``-`` between Zipf instances with shared and distinct keys."""
    z = Zipf({"one": 0.5, "two": 0.5})
    other = Zipf({"one": 0.25, "four": 0.25})
    third = Zipf({"three": 1})

    expected = (
        Zipf(),
        Zipf({"two": 0.5, "one": 0.25, "four": -0.25}),
        Zipf({"two": 0.5, "one": 0.25, "four": -0.25, "three": -1}),
    )
    actual = (z - z, (z - other).sort(), (z - other - third).sort())
    assert expected == actual
def test_var():
    """``var`` of the three sample values must equal 0.03166666666667."""
    values = {"one": 0.2, "two": 0.25, "three": 0.6}
    result = Zipf(values).var()
    assert result == 0.03166666666667
def _bipartite_op_stats_bipartite(g, output=None):
    """Log statistics of bipartite graph ``g`` and plot degree distributions.

    Vertices whose ``"type"`` attribute is falsy are treated as L2, the
    others as L3. The function logs the vertex and edge counts, the highest
    degree of each level, connectedness details and the ``Zipf.mle``
    estimates (L2 for xmin in 1..3, L3 with the default xmin). L3 vertices
    of degree 0 are ignored in the L3 degree list.

    When ``output`` is truthy, four figures are saved (histogram and log-log
    distribution for each level) with ``output`` as file-name suffix.
    Otherwise the four plots are placed on a 2x2 grid and shown with
    ``plt.show()``.

    Returns:
        The same ``g`` object that was passed in.
    """
    # Vertex/edge count
    # Read the "type" attribute once instead of once per vertex in each
    # comprehension.
    vertex_types = g.vs["type"]
    l2_vertices = [idx for idx, is_l3 in enumerate(vertex_types) if not is_l3]
    l3_vertices = [idx for idx, is_l3 in enumerate(vertex_types) if is_l3]
    info(" Number of L2 vertices: %d" % (len(l2_vertices)))
    info(" Number of L3 vertices: %d" % (len(l3_vertices)))
    info(" Number of edges: %d" % (g.ecount()))

    l2_degrees = g.degree(l2_vertices)
    l3_degrees = g.degree(l3_vertices)
    l3_degrees = [i for i in l3_degrees if i > 0]
    max_l2_degree = g.maxdegree(l2_vertices)
    max_l3_degree = g.maxdegree(l3_vertices)
    max_degree = max(max_l2_degree, max_l3_degree)
    info(" Highest L2 degree: %d" % (max_l2_degree))
    info(" Highest L3 degree: %d" % (max_l3_degree))

    # NOTE(review): these histograms are not used below; kept because
    # numpy.histogram also validates ``bins`` — confirm before removing.
    l2_degrees_hist = numpy.histogram(l2_degrees, bins=max_degree,
                                      range=(0, max_degree))
    l3_degrees_hist = numpy.histogram(l3_degrees, bins=max_degree,
                                      range=(0, max_degree))

    # Connectedness
    if g.is_connected():
        info(" Graph is connected")
    else:
        info(" Graph is not connected")
        clusters = g.clusters()
        info(" Number of connected components = %d" % (len(clusters)))
        giant = clusters.giant()
        info(" Size of largest component = %d" % (giant.vcount()))

    # L2/L3 power law exponents estimators
    info(" Maximum Likelihood Estimator:")
    l2_xmin_range = range(1, 4)
    l2_mle = _estimate_pareto_params(l2_degrees, l2_xmin_range)
    for xmin in l2_xmin_range:
        info(" alpha (L2, xm=%d): %.20f" % (xmin, l2_mle[xmin]))
    l3_mle = Zipf.mle(l3_degrees)
    info(" alpha (L3, xm=1): %.20f" % (l3_mle))

    # L2/L3 degrees distributions
    if output:
        # L2 histogram with one fitted curve per xmin.
        fig = plt.figure()
        ax = plt.subplot(1, 1, 1)
        _plot_degree_hist(ax, l2_degrees, max_degree)
        for xmin in l2_xmin_range:
            _plot_pareto_pdf(ax, l2_mle[xmin], xmin, l2_degrees, max_degree,
                             hist=False)
        filename = "l2-degree-hist-%s" % (output)
        info(' Create "%s"' % (filename))
        fig.savefig(filename)

        # L2 log-log distribution with one fitted curve per xmin.
        fig = plt.figure()
        ax = plt.subplot(1, 1, 1)
        _plot_degree_dist_loglog(ax, l2_degrees, max_degree)
        for xmin in l2_xmin_range:
            _plot_pareto_pdf(ax, l2_mle[xmin], xmin, l2_degrees, max_degree)
        filename = "l2-degree-dist-loglog-%s" % (output)
        info(' Create "%s"' % (filename))
        fig.savefig(filename)

        # L3 histogram with the fitted curve for xmin = 1.
        fig = plt.figure()
        ax = plt.subplot(1, 1, 1)
        _plot_degree_hist(ax, l3_degrees, max_degree)
        _plot_pareto_pdf(ax, l3_mle, 1, l3_degrees, max_degree, hist=False)
        filename = "l3-degree-hist-%s" % (output)
        info(' Create "%s"' % (filename))
        fig.savefig(filename)

        # L3 log-log distribution with the fitted curve for xmin = 1.
        fig = plt.figure()
        ax = plt.subplot(1, 1, 1)
        _plot_degree_dist_loglog(ax, l3_degrees, max_degree)
        _plot_pareto_pdf(ax, l3_mle, 1, l3_degrees, max_degree)
        filename = "l3-degree-dist-loglog-%s" % (output)
        info(' Create "%s"' % (filename))
        fig.savefig(filename)
    else:
        # Interactive mode: the four plots share one 2x2 figure.
        fig = plt.figure()
        ax = plt.subplot(2, 2, 1)
        _plot_degree_hist(ax, l2_degrees, max_degree)
        ax = plt.subplot(2, 2, 2)
        _plot_degree_dist_loglog(ax, l2_degrees, max_degree)
        for xmin in l2_xmin_range:
            _plot_pareto_pdf(ax, l2_mle[xmin], xmin, l2_degrees, max_degree)
        ax = plt.subplot(2, 2, 3)
        _plot_degree_hist(ax, l3_degrees, max_degree)
        ax = plt.subplot(2, 2, 4)
        _plot_degree_dist_loglog(ax, l3_degrees, max_degree)
        _plot_pareto_pdf(ax, l3_mle, 1, l3_degrees, max_degree)
        plt.show()
    return g
def test_remap():
    """``remap`` must keep this Zipf's values in the remapper's key order."""
    source = Zipf({"one": 0.2, "two": 0.25, "three": 0.6})
    remapper = Zipf({"three": 0.7, "one": 0.2})
    expected = Zipf({"three": 0.6, "one": 0.2})
    assert expected == source.remap(remapper)
def test_setitem():
    """Item assignment on an empty Zipf must store the given key and value."""
    target = Zipf()
    target["my_key"] = 0.5
    expected = Zipf({"my_key": 0.5})
    assert target == expected
def test_getitem():
    """Check item lookup for a present key and for a missing key (0)."""
    z = Zipf({"g": 1})
    looked_up = (z.__getitem__("g"), z["no_key"], z.__getitem__("g"))
    wanted = (1, 0, z["g"])
    assert looked_up == wanted
def test_missing():
    """``__missing__`` must return 0 for an unknown key."""
    empty = Zipf()
    fallback = empty.__missing__("my_missing_key")
    assert fallback == 0
def test_min():
    """``min`` must return the key ``"one"`` for the sample values."""
    values = {"one": 0.2, "two": 0.25, "three": 0.6}
    lowest = Zipf(values).min()
    assert lowest == "one"
def _estimate_pareto_params(degrees, beta_range): alphas = {} for beta in beta_range: # alphas[beta]= Pareto.mle(degrees, beta) alphas[beta] = Zipf.mle(degrees, beta) return alphas
def test_cut():
    """Check ``cut`` with one bound and with two bounds."""
    z = Zipf({"one": 0.2, "two": 0.25, "three": 0.6})

    actual = (
        z.cut(0.2),
        z.cut(0.25),
        z.cut(0.6),
        z.cut(0, 0.6),
        z.cut(0.2, 0.6),
        z.cut(0.25, 0.6),
        z.cut(0.2, 0.25),
    )
    expected = (
        Zipf({"two": 0.25, "three": 0.6}),
        Zipf({"three": 0.6}),
        Zipf(),
        z,
        Zipf({"two": 0.25, "three": 0.6}),
        Zipf({"three": 0.6}),
        Zipf({"two": 0.25}),
    )
    assert actual == expected