def IQR(file_name, path_to_file_name, is_Simul, isMax, max_number_or_var, min_number):
    df, df_cumul_sum, list_of_dfs = prepare_data(path_to_file_name)  # bring the necessary data into a proper format
    df_removed = df  # the adjusted dataframe will be stored here
    outliers_list = []  # outliers based on the final cumulative sum
    number_of_outliers = 0
    for index in range(len(list_of_dfs)):  # go through the dataframe of each page
        Q1 = list_of_dfs[index].quantile(0.25)
        Q3 = list_of_dfs[index].quantile(0.75)
        iqr = Q3 - Q1
        # dataframe of outliers outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        df_outliers = list_of_dfs[index][(list_of_dfs[index] < (Q1 - 1.5 * iqr)) |
                                         (list_of_dfs[index] > (Q3 + 1.5 * iqr))]
        df_outliers, isMax, max_number_or_var, number_of_outliers, df_removed, outliers_list = upper_bound_check(
            is_Simul, df_outliers, isMax, max_number_or_var,
            number_of_outliers, df_removed, outliers_list)
    df_removed, list_of_random_outliers = upper_rand_check(outliers_list, df, max_number_or_var)
    if not is_Simul:
        WF.write_csv(df_removed, "iqr_removed_" + file_name)
    print("Outliers based on IQR of the final CUMUL_SUM:")
    print_comparison("IQR_CUMUL_SUM", path_to_file_name, outliers_list,
                     number_of_outliers, df, df_removed, len(df_cumul_sum.index))  # print the comparison
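# A minimal, self-contained sketch of the 1.5*IQR fence applied in IQR() above,
# on a toy series. It is illustrative only, not part of the pipeline, and
# assumes the module-level pandas import used throughout this file.
def iqr_outliers_sketch(series):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    # keep only the values outside [q1 - 1.5*iqr, q3 + 1.5*iqr]
    return series[(series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)]

# Example: iqr_outliers_sketch(pd.Series([10, 11, 12, 11, 10, 95])) flags only 95.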
def data_preparation():
    urls_list = SUP.read_urls()
    joined_df = pd.DataFrame()
    for index, url in enumerate(urls_list):
        file_name = SUP.multi_files_names(url)  # get the specific file name of the merged data for this URL
        path_to_file_name = SUP.merge_file_path(file_name)  # enter the directory of "file_name"
        df, df_cumul_sum, list_of_dfs = prepare_data(path_to_file_name)  # bring the necessary data into a proper format
        df = df.transpose()
        df.loc[0:40, 'site'] = url  # label the rows of this page with its URL
        joined_df = joined_df.append(df)  # ignore_index=True
    print(joined_df)
    WF.write_csv(joined_df, "joined_df" + file_name)
    return joined_df
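# Illustrative sketch of the join step in data_preparation(): each per-URL frame
# is transposed, labelled with its 'site', and stacked into one frame.
# frames_by_url is an assumed toy input, and pd.concat is used here only for
# the sketch; the function above uses DataFrame.append.
def join_sketch(frames_by_url):
    parts = []
    for url, page_df in frames_by_url.items():
        page_df = page_df.transpose()
        page_df['site'] = url  # label every row with its source URL
        parts.append(page_df)
    return pd.concat(parts)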
def z_score(file_name, path_to_file_name, is_Simul, isMax, max_number_or_var, min_number):
    df, df_cumul_sum, list_of_dfs = prepare_data(path_to_file_name)  # bring the necessary data into a proper format
    df_removed = df  # the adjusted dataframe will be stored here
    outliers_list = []  # outliers based on the final cumulative sum
    threshold = 1.8
    number_of_outliers = 0
    for index in range(len(list_of_dfs)):  # go through the dataframe of each page
        mean = np.mean(list_of_dfs[index])  # mean over the instances of one page
        std = np.std(list_of_dfs[index])  # standard deviation over the instances of one page
        # dataframe of outliers based on z = (X - mean) / std
        df_outliers = list_of_dfs[index][np.abs(list_of_dfs[index] - mean) / std > threshold]
        df_outliers, isMax, max_number_or_var, number_of_outliers, df_removed, outliers_list = upper_bound_check(
            is_Simul, df_outliers, isMax, max_number_or_var,
            number_of_outliers, df_removed, outliers_list)
    if is_Simul:
        WF.write_csv(df_removed, "z-score_removed_" + file_name)
    print("Outliers based on Z-score of the final CUMUL_SUM:")
    print_comparison("Z-score_CUMUL_SUM", path_to_file_name, outliers_list,
                     number_of_outliers, df, df_removed, len(df_cumul_sum.index))  # print the comparison
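# A minimal sketch of the z-score rule applied in z_score() above: flag values
# whose |x - mean| / std exceeds the threshold. Toy usage only; assumes the
# module-level numpy/pandas imports used throughout this file.
def z_score_outliers_sketch(series, threshold=1.8):
    mean = np.mean(series)
    std = np.std(series)
    return series[np.abs(series - mean) / std > threshold]

# Example: z_score_outliers_sketch(pd.Series([10, 11, 12, 11, 10, 95])) flags only 95.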
def find_callfor_IQR(URL, path_to_file_name, is_Simul):
    # call search_marge_files_names() to get the specific file name of the merged data
    file_name = search_marge_files_names(URL)
    # call merge_file_path() to enter the directory of "file_name" and locate the file
    file_name1 = merge_file_path(file_name)
    df = pd.read_csv(file_name1)
    df_removed = df  # the adjusted dataframe will be stored here
    header_list = list(df.columns)
    outliers_list = []
    page_outliers_dict = {}
    for i in header_list[1:41]:
        outliers = IQR.detect_outliers_with_IQR(df[i])
        if len(outliers) > 0:
            page_outliers_dict[i] = "1"
            df_removed = df_removed.drop(columns=i)  # remove the outlier's column
    outliers_list.append(page_outliers_dict)
    if is_Simul:
        WF.write_csv(df_removed, "lqr_all_points_removed_" + file_name)
    return "IQR_ALL_POINTS", path_to_file_name, outliers_list, len(
        outliers_list), df, df_removed, len(header_list) - 1
def read_files(feature_names, urls_names):
    path = '/home/batyi/Desktop/Study/Study project/website_fingerprinting_proxy-stable/workspace/features_files/'
    sourcepath = os.listdir(path)
    df_main = pd.DataFrame()
    # To measure the maximum number of rows among the files (result: 29021):
    # max_row_length = 0
    # for name in feature_names:
    #     for sfile in sourcepath:
    #         if name + '_tls.csv' == sfile:
    #             df = pd.read_csv(path + name + '_tls.csv')
    #             if max_row_length < df.shape[0]:
    #                 max_row_length = df.shape[0]
    # print(max_row_length)
    t = 0
    for name in feature_names:
        for sfile in sourcepath:
            if name + '_tls.csv' == sfile:
                df = pd.read_csv(path + name + '_tls.csv')
                df = df.replace({'direction': 0}, -1)  # re-encode direction 0 as -1
                df_main = df_main.append(df['direction'], ignore_index=True, sort=False)
                t += 1
                break
    df_main = df_main.fillna(0)  # pad shorter rows with zeros
    df_main = df_main.loc[:, 0:4999]  # keep only the first 5000 columns of each row
    df_main['index'] = feature_names
    # Rearrange the columns: move the last column ('index') to the first position
    cols = df_main.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df_main = df_main[cols]
    df_main['site'] = urls_names
    WF.write_csv(df_main, "cnn_dataframe.csv")
    print(df_main)
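# Sketch of the direction re-encoding used in read_files(): DataFrame.replace
# with a {'column': value} mapping turns zeros in the 'direction' column into
# -1 while leaving other columns untouched. Toy frame, illustrative only.
def direction_encoding_sketch():
    demo = pd.DataFrame({'direction': [0, 1, 0, 1], 'length': [0, 64, 0, 128]})
    demo = demo.replace({'direction': 0}, -1)
    return demo  # demo['direction'] is now [-1, 1, -1, 1]; 'length' keeps its zeros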
def find_callfor_LR(URL, path_to_file_name, is_Simul):
    # call search_marge_files_names() to get the specific file name of the merged data
    file_name = search_marge_files_names(URL)
    # call merge_file_path() to enter the directory of "file_name" and locate the file
    file_name1 = merge_file_path(file_name)
    df = pd.read_csv(file_name1)
    df_removed = df  # the adjusted dataframe will be stored here
    outliers_list = []
    header_list = list(df.columns)
    index = header_list[0]
    df2 = LR.get_summary_fv_ds()
    page_outliers_dict = {}
    for i in header_list[1:41]:
        # create a two-column DataFrame ('index' and the current page column)
        df1 = pd.DataFrame(data=df, columns=['index', i])
        outliers_RMS, max_point = LR.detect_outliers_with_LR(df1, df2)
        if outliers_RMS > max_point:
            page_outliers_dict[i] = "1"
            df_removed = df_removed.drop(columns=i)  # remove the outlier's column
    if len(page_outliers_dict) != 0:
        outliers_list.append(page_outliers_dict)
    if is_Simul:
        WF.write_csv(df_removed, "lr_all_points_removed_" + file_name)
    return "LR_ALL_POINTS", path_to_file_name, outliers_list, len(
        outliers_list), df, df_removed, len(header_list) - 1
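# Generic sketch of the flag-and-drop pattern shared by find_callfor_IQR() and
# find_callfor_LR(): every page column flagged by a detector is recorded and
# dropped from a working copy. The is_outlier_column argument is a stand-in
# predicate, not the project's IQR/LR modules.
def drop_flagged_columns_sketch(df, is_outlier_column):
    flagged = {}
    df_removed = df
    for col in df.columns[1:]:  # skip the leading 'index' column
        if is_outlier_column(df[col]):
            flagged[col] = "1"
            df_removed = df_removed.drop(columns=col)
    return df_removed, flagged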
def find_min_max_mean(URL):
    # call multi_files_names() to get the specific file name of the merged data
    file_name = multi_files_names(URL)
    # call merge_file_path() to enter the directory of "file_name" and locate the file
    file_name = merge_file_path(file_name)
    df = pd.read_csv(file_name)
    header_list = list(df.columns)

    # Split the feature column headers and keep "<identifier>.<crawl number>"
    # Ref: https://stackoverflow.com/questions/12920214/python-for-syntax-block-code-vs-single-line-generator-expressions
    new_header_list = []
    for s in header_list[1:41]:
        s = s.split("_")  # split the feature column header
        new_header_list.append(s[0] + "." + s[1])  # concatenate the identifier and the crawl number

    # Ref: https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/
    df_data = pd.DataFrame(data=df)
    df_data = df_data.drop(columns="index")  # delete the "index" column from the dataframe
    # Ref: https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
    df_data.columns = new_header_list  # rename all columns in one go

    sns.boxplot(x="variable", y="value", data=pd.melt(df_data))
    plt.title('Detecting Outliers')
    plt.show()

    # Create a new dataframe that describes the summary of the dataset.
    # Refs:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html
    # https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/
    # https://stackoverflow.com/questions/20461165/how-to-convert-index-of-a-pandas-dataframe-into-a-column
    s_data = pd.DataFrame(df.describe(include='all'))
    s_data = s_data.drop(columns="index")
    # Alternative: use DataFrame.insert() to add a column
    # s_data.insert(0, "describe", ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], True)

    # Transpose so that the summary statistics become columns.
    # Refs:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transpose.html
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
    df_transposed = s_data.T  # or s_data.transpose()
    df_transposed = df_transposed.reset_index().set_index('index', drop=True)

    # Write the summary DataFrame into the output directory.
    # Refs:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-feather
    WF.write_csv(df_transposed)
    WF.write_ori(df_transposed)
    return
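# Illustration of the header split in find_min_max_mean(): a feature column
# name such as "3_1_www.google.com" is split on "_" so that the identifier and
# crawl number can be rejoined as "3.1" (the example header is illustrative).
def header_split_sketch(header="3_1_www.google.com"):
    parts = header.split("_")
    return parts[0] + "." + parts[1]  # -> "3.1"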