def plotResult(ds, save=True):
    """Draw a boxplot of the percentage difference per p for a single dataset.

    Reads ``results/<ds>.csv``, drops BBLIP rows, and saves the figure as
    ``results/sns_bp_<ds>.png``.
    """
    frame = pd.read_csv(getbase_dir('results') + ds + '.csv', sep=';')
    # BBLIP results are excluded from the comparison
    frame = frame[frame.bf_type != 'BBLIP']
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=frame, x='p', y='diff', hue='bf_type', showfliers=False)
    plt.title("")
    plt.ylabel("Diferença em %")
    # plt.show()
    fig.savefig(getbase_dir('results') + 'sns_bp_' + ds + '.png')
def readResultsSPF02():
    """Load every ``msplit_*`` result CSV from ``results/sbf_02_data`` into one frame.

    The 'bikes' file is labelled 'bike' (historical naming); BBLIP rows are
    dropped from every file before concatenation.
    """
    base = getbase_dir(['results', 'sbf_02_data'])
    # (file name, label written into the 'ds' column)
    datasets = [('bikes', 'bike'), ('beer', 'beer'), ('books1', 'books1'),
                ('eletronics', 'eletronics'), ('movies1', 'movies1'),
                ('music', 'music'), ('restaurants1', 'restaurants1')]
    frames = []
    for fname, label in datasets:
        part = pd.read_csv(base + 'msplit_' + fname + '.csv', sep=';')
        part['ds'] = label
        part = part[part.bf_type != 'BBLIP']
        frames.append(part)
    return pd.concat(frames)
def processMultisplit(datadir, basename, e1_fields, e2_fields, bflen):
    """Compare the encrypted records for every split count in parallel and save the CSV.

    :param datadir: dataset directory name (also used to name the output CSV)
    :param basename: CSV file name inside the dataset directory
    :param e1_fields: field indexes for the first entity
    :param e2_fields: field indexes for the second entity
    :param bflen: Bloom filter capacity
    """
    # at most 8 bits per split
    b = BloomFilter(cap=bflen)
    max_split = round(np.log2(b.bit_size)) - 2
    ed = encrypt_data(datadir, basename, e1_fields, e2_fields, bflen, set_p=0.5)
    # FIX: use the pool as a context manager so worker processes are always released
    with mp.Pool(processes=4) as pool:
        results = [
            pool.apply_async(parallel_compare_multisplit, args=(ed, s))
            for s in np.arange(1, max_split, 1)
        ]
        output = [p.get() for p in results]
    # FIX: DataFrame.append was removed in pandas 2.0 (and the loop was O(n^2));
    # a single concat produces the same frame in one pass
    df = pd.concat(output, ignore_index=True)
    # df['diff'] = abs(df.full - df.sbf_sim) * 100
    # ax = df.boxplot('diff',by='p',rot=30)
    df.to_csv(getbase_dir('results') + "msplit_" + datadir + '.csv', sep=';')
    print('Done ' + datadir + "!")
def plot_all_ds_considering_split_number(df, dash_styltes):
    """Line plot of the mean error against the number of splits (Equation 01 of section 2).

    :param df: results frame with 'splits', 'mean_erro', 'bf_type' and 'similarity' columns
    :param dash_styltes: dash-style sequence forwarded to seaborn's lineplot
    :return: None (shows the figure and saves it under results/sbf_02b)
    """
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    # BUG FIX: the body referenced the undefined name 'dash_styles' instead of the
    # parameter, so the function only worked when a global of that name existed.
    sns.lineplot(data=df, x='splits', y='mean_erro', hue='bf_type',
                 style='similarity', dashes=dash_styltes)
    # x axis is the split count, shown on a log2 scale with plain-number labels
    ax.set_xscale('log')
    ax.xaxis.set_major_locator(ticker.LogLocator(base=2.0, subs=(1.0,), numdecs=0, numticks=None))
    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
    plt.title("Similarity Error")
    plt.ylabel("Error")
    plt.xlabel("Number of splits")
    plt.show()
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "zz_all_ds_considering_split_number.png", dpi=300)
def exponential_regression2var(func_exp, x_data, y_data, xg, yg, eq_label=r'$f(x) = {:.2f} * ln( {:.2f} * x) + {:.2f}$'):
    """Fit ``func_exp`` to (x_data, y_data), plot the data and fitted curve, save the figure.

    :param func_exp: three-parameter model function f(x, a, b, c)
    :param x_data: x values used for fitting
    :param y_data: y values used for fitting
    :param xg: x values of the raw data points to scatter
    :param yg: y values of the raw data points to scatter
    :param eq_label: format template for the legend entry of the fitted curve
    :return: the optimal parameters found by curve_fit
    """
    fig = plt.gcf()
    # initial guess for the three model parameters
    popt, pcov = scipy.optimize.curve_fit(func_exp, x_data, y_data, p0=(-1, 0.01, 0))
    print(popt)
    plt.plot(xg, yg, 'x', color='xkcd:maroon', label="data")
    predicted = func_exp(xg, *popt)
    rmse = np.sqrt(mean_squared_error(yg, predicted))
    curve_label = eq_label.format(*popt) + ", rmse = {:.3f}".format(rmse)
    plt.plot(x_data, func_exp(x_data, *popt), color='xkcd:teal', label=curve_label)
    plt.legend()
    plt.title("Estimated Error in SBF")
    plt.xlabel("$x=\\frac{s}{l}$")
    plt.ylabel('Error')
    plt.show()
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "new_estimated_sbf_erro.png", dpi=300)
    return popt
def plot_all_ds_considering_percent(df, dash_styltes):
    """Plot the mean distance from the real similarity against the split size.

    The x axis is the split length expressed as a percentage of the original
    filter; one line per dataset. The figure is shown and saved under
    results/sbf_02b.
    """
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=df, x='x', y='mean_dist_of_real', hue='ds', dashes=[(2, 2)])
    # major ticks every 10%, minor marks every 5%
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.1))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(0.05))
    plt.title("SBF Split Error")
    plt.ylabel("Mean Error (\u03B5)")
    plt.xlabel("Split length in % of original filter")
    plt.show()
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "zz_all_ds_error_bit_percent.png", dpi=300)
def compileContract(file, lib=None, ldlib=None, file_path="Contracts"):
    """Compile a Solidity contract with solc v0.5.4 using standard-JSON input.

    :param file: contract file name
    :param lib: optional library argument forwarded to get_input_json
    :param ldlib: optional link-library argument forwarded to get_input_json
    :param file_path: directory (relative to the project base) holding the contracts
    :return: the compiler's standard-JSON output
    """
    contracts_dir = getbase_dir(file_path)
    input_json = get_input_json(contracts_dir, file, lib, ldlib)
    set_solc_version('v0.5.4')
    # return compile_files([contracts_dir+file])
    return compile_standard(input_json, allow_paths=contracts_dir)
def plot_sbfError(df):
    """Boxplot of the SBF percentage error per p, grouped by Bloom-filter type."""
    sns.set(style="whitegrid")
    fig, axis = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x='p', y='diff', hue='bf_type', showfliers=False)
    plt.title("SBF Error")
    plt.ylabel("Error in %")
    plt.show()
    fig.savefig(getbase_dir('results') + "sbf_error_all_ds" + ".png")
def encrypt_data2(datadir, basename, e1_fields, bflen, fp=0.01, ngrams=2, lpower=256, enc='utf-8', set_p=None):
    """Read a dataset CSV and Bloom-encrypt the concatenated ``e1_fields`` of each row.

    :param datadir: dataset directory name under ``Datasets``
    :param basename: CSV file name inside that directory
    :param e1_fields: column indexes whose values are concatenated and encrypted
    :param bflen: Bloom filter capacity passed to encryptData
    :param fp: target false-positive rate
    :param ngrams: n-gram size used for tokenisation
    :param lpower: bit-length power passed to encryptData
    :param enc: file encoding (decode errors are replaced)
    :param set_p: optional split proportion; forwarded to encryptData only when given
    :return: list of [row_id, encrypted_filter] pairs
    """
    base_dir = getbase_dir(['Datasets', datadir])  # + os.sep
    rows = []
    with open(base_dir + basename, encoding=enc, errors='replace') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # FIX: manual line counter replaced with enumerate
        for line_count, row in enumerate(csv_reader):
            if line_count == 0:
                continue  # skip the header row
            try:
                # concatenate the selected fields into a single string to encrypt
                dbf1 = ''.join(row[i] for i in e1_fields)
                # FIX: '== None' -> 'is None'; the two near-identical encryptData
                # calls are merged — 'p' is only forwarded when set_p was given
                kwargs = {'n': ngrams, 'fp': fp, 'bpower': lpower}
                if set_p is not None:
                    kwargs['p'] = set_p
                rows.append([row[0], encryptData(dbf1, bflen, **kwargs)])
            except IndexError:
                # best-effort: report malformed/short rows and keep going
                print(row)
                print(e1_fields)
    return rows
def plot_erroInSBFParts(pdf):
    """Notched boxplot of the per-split error ('temp') for every dataset, hued by part."""
    sns.set(style="whitegrid")
    fig, axis = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=pdf, x='ds', y='temp', hue='part',
                showfliers=False, notch=True)
    plt.title("SBF Error in splits")
    plt.ylabel("Error in %")
    plt.show()
    fig.savefig(getbase_dir('results') + "sbf_parts_error_.png")
def process(datadir, basename, e1_fields, e2_fields, bflen):
    """Run parallel_compare for p in 0.1..0.9 in parallel, then plot and persist the results.

    :param datadir: dataset directory name (also used to name the output files)
    :param basename: CSV file name inside the dataset directory
    :param e1_fields: field indexes for the first entity
    :param e2_fields: field indexes for the second entity
    :param bflen: Bloom filter capacity
    """
    # FIX: use the pool as a context manager so worker processes are always released
    with mp.Pool(processes=8) as pool:
        results = [
            pool.apply_async(parallel_compare,
                             args=(datadir, basename, e1_fields, e2_fields, bflen),
                             kwds={'set_p': p})
            for p in np.arange(0.1, 1.0, 0.1)
        ]
        output = [p.get() for p in results]
    # FIX: DataFrame.append was removed in pandas 2.0 (and the loop was O(n^2));
    # a single concat produces the same frame in one pass
    df = pd.concat(output, ignore_index=True)
    # absolute difference between full similarity and SBF similarity, in percent
    df['diff'] = abs(df.full - df.sbf_sim) * 100
    # ax = df.boxplot('diff',by='p',rot=30)
    ax = df.boxplot('diff', by=['p', 'bf_type'], rot=90, figsize=(18, 10))
    fig = ax.get_figure()
    plt.title("")
    plt.xlabel("")
    plt.ylabel("Diferença em %")
    fig.savefig(getbase_dir('results') + datadir + '.png')
    df.to_csv(getbase_dir('results') + datadir + '.csv', sep=';')
    print('Done ' + datadir + "!")
def plot_error_epsilon_distribution(z, marcas=None, fs=(12, 6)):
    """Plot the epsilon-error distribution (with a Laplace fit) for selected split sizes.

    :param z: results frame; NOTE: a 'nd' column ((full - psim_mean) * 100) is
              added to it in place
    :param marcas: indexes into ``z.x.unique()`` selecting which split sizes to
                   plot; defaults to [1, 3, 5, 6]
    :param fs: figure size
    """
    # BUG FIX: mutable list default argument replaced by a None sentinel
    if marcas is None:
        marcas = [1, 3, 5, 6]
    z['nd'] = (z.full - z.psim_mean) * 100
    labels = list(z.x.unique()[marcas])
    # two rows of subplots, half the selected splits per row
    fig, axes = plt.subplots(2, int(len(labels) / 2), figsize=fs, constrained_layout=True)
    fig.suptitle("\u03B5-Error Distribution", y=1.05)
    # (removed an unused 'colors' list and a stray debug print from the loop)
    for x in range(0, len(labels)):
        if x < len(labels) / 2:
            eixo = axes[0, x]
        else:
            eixo = axes[1, int(x - len(labels) / 2)]
        eixo.set_title('Split of {:.2%}'.format(labels[x]))
        sns.distplot(z[z.x == labels[x]].nd, fit=st.laplace,
                     label='length={}'.format(x), kde=False, ax=eixo)
    for ax1 in axes.flat:
        ax1.set(xlabel='error')
    plt.show()
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "zz_p_error_episilon.png", dpi=300)
def plot_episilon_approximation(a=-0.042876301194393125, b=3.2574724870013103):
    """Plot the epsilon-estimation curve together with its asymptotic approximation.

    BUG FIX: the ``a`` and ``b`` parameters were immediately overwritten by
    hard-coded constants, so caller-supplied values were silently ignored.
    Those constants are now the defaults and explicit arguments are honoured.

    :param a: fitted coefficient of the full equation
    :param b: fitted coefficient of the full equation
    """
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(5, 4))
    # full equation
    fe = lambda x, a, b: -1 * np.log(a * np.log(b * x))
    # asymptotic approximation; NOTE(review): the +3.7 offset disagrees with the
    # 'c=2' shown in its legend label — confirm which constant is intended
    as1 = lambda x: (np.log(1 / np.log(1 / x))) + 3.7
    p = np.linspace(0.00001, .25, num=50)
    data = []
    for xp in p:
        data.append((xp, fe(xp, a, b), r'$ln(\frac{1}{a * ln(b*x)})$'))
        data.append((xp, as1(xp), r'$ln(\frac{1}{ln(\frac{1}{x})})+ c , c=2$'))
    labels = ['x', 'y', 'function']
    r = pd.DataFrame.from_records(data, columns=labels)
    sns.lineplot(data=r, x='x', y='y', hue='function')
    plt.title("$\\epsilon\ estimation$")
    plt.xlabel("splits size$(\\frac{s}{l})$")
    plt.ylabel(r'$\epsilon$')
    plt.show()
    plt.close()
    fig.savefig(getbase_dir(['results', 'sbf_02b']) + "episilon_estimation.png", dpi=400)
def plot_summaryByDataset(rdf):
    """Scatter plot of the median error per dataset and filter type, with jittered x.

    :param rdf: summary frame; NOTE: its 'p' column is overwritten in place with
                jittered values (assumes an 'np' column holds the numeric p —
                TODO confirm against the caller)
    """
    def _jitter(v):
        # small gaussian noise so overlapping points remain distinguishable
        return np.random.normal(v, 0.03)

    # BUG FIX: removed an unused sibling helper whose body referenced the
    # function object itself (np.random.normal(g, 0.03)) instead of its argument
    rdf.p = rdf.np.apply(_jitter)
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    ax = sns.scatterplot(x="p", y="median_error", hue="bf_type", alpha=0.8,
                         x_jitter=True, s=150, style='dataset', palette="Set2", data=rdf)
    # legend is drawn below the axes in three columns
    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.9, -0.1), ncol=3)
    import matplotlib.ticker as ticker
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.1))
    ax.set_title("SBF Error")
    ax.set_ylabel("Median error in %")
    ax.grid('on')
    plt.tight_layout()
    plt.show()
    fig.savefig(getbase_dir('results') + 'erro_in_all_ds_.png')
b = BloomFilter(cap=96) #gabarito bases = ['bikes','beer', 'books1', 'eletronics', 'movies1', 'music', 'restaurants1'] df = df[df.id_a != 'ltable._id'] df = df[df.id_b != 'rtable._id'] df = df.round(2) for datadir in df.ds.unique(): dsg = df[(df.ds == datadir) & (df.bf_type == 'BBF')] base_dir = getbase_dir(['Datasets', datadir ]) # + os.sep gab_files = base_dir + 'labeled_data.csv' print(gab_files) dsg.id_a = pd.to_numeric(dsg.id_a) dsg.id_b = pd.to_numeric(dsg.id_b) gs = pd.read_csv(gab_files,skiprows=5) r0 = [] r1 = [] for index, row in dsg.iterrows(): aid = row.id_a bid = row.id_b if len(gs[(gs['ltable._id'] == aid) & (gs['rtable._id'] == bid) & (gs.gold == 1)]) == 1: s1 = row.sbf_sim
###################################################################################################### df['py'] = abs(df.full - df.psim_median) sns.set(style="whitegrid") fig, ax = plt.subplots(figsize=(10, 6)) sns.boxplot(data=df, x='splits', y='py') #ax.set_xscale('log') #ax.set_yscale('log') #plt.axvline(7, 0.05 ,3,color='red') #ax.set(xscale="log") ax.yaxis.set_major_locator(ticker.MultipleLocator(0.1)) plt.title("SBF Split Error in ") plt.ylabel("Error") plt.show() fig.savefig(getbase_dir('results') + "zz_erro_incease_split.png") sns.set(style="whitegrid") fig, ax = plt.subplots(figsize=(10, 6)) sns.lineplot(data=df.head(50000), x='bits', y='median_dist_of_real', hue='ds', style='ds', dashes=dash_styles) ax.set_xscale('log') # ax.set_xticklabels(rotation=30) ax.xaxis.set_minor_formatter(ticker.ScalarFormatter()) #ax.get_xaxis().get_major_formatter().set_scientific(False) #ax.get_xaxis().get_major_formatter().set_useOffset(False) plt.xticks(2**np.arange(10, dtype=np.uint64))