def IQR(file_name, path_to_file_name, is_Simul, isMax, max_number_or_var,
        min_number):
    # Bring the necessary data into the proper format.
    df, df_cumul_sum, list_of_dfs = prepare_data(path_to_file_name)

    df_removed = df  # the adjusted dataframe will be stored here
    outliers_list = []  # outliers based on the final cumulative sum
    number_of_outliers = 0
    for index in range(len(list_of_dfs)):  # go through the dataframe of each page
        q1 = list_of_dfs[index].quantile(0.25)
        q3 = list_of_dfs[index].quantile(0.75)
        iqr = q3 - q1  # lowercase to avoid shadowing this function's name
        # Dataframe of outliers: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        df_outliers = list_of_dfs[index][
            (list_of_dfs[index] < (q1 - 1.5 * iqr)) |
            (list_of_dfs[index] > (q3 + 1.5 * iqr))]
        df_outliers, isMax, max_number_or_var, number_of_outliers, df_removed, outliers_list = upper_bound_check(
            is_Simul, df_outliers, isMax, max_number_or_var,
            number_of_outliers, df_removed, outliers_list)
    df_removed, list_of_random_outliers = upper_rand_check(
        outliers_list, df, max_number_or_var)
    if not is_Simul:
        WF.write_csv(df_removed, "iqr_removed_" + file_name)
    print("Outliers based on IQR based on final CUMUL_SUM:")
    print_comparison("IQR_CUMUL_SUM", path_to_file_name,
                     outliers_list, number_of_outliers, df, df_removed,
                     len(df_cumul_sum.index))  # print summary information
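# A minimal, self-contained sketch of the 1.5*IQR fence applied above, run on
# a hypothetical Series (the values are invented for illustration):
def _iqr_fence_demo():
    import pandas as pd
    s = pd.Series([1.0, 1.2, 0.9, 1.1, 1.0, 9.5])
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    # Everything outside [q1 - 1.5*iqr, q3 + 1.5*iqr] is treated as an outlier;
    # here only 9.5 falls outside the fences.
    return s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)]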
# Example no. 2
def data_preparation():
    urls_list = SUP.read_urls()

    joined_df = pd.DataFrame()
    for index, url in enumerate(urls_list):
        # Call multi_files_names() to get the specific file name for the merged data.
        file_name = SUP.multi_files_names(url)
        # Call merge_file_path() to enter the directory of "file_name".
        path_to_file_name = SUP.merge_file_path(file_name)
        # Bring the necessary data into the proper format.
        df, df_cumul_sum, list_of_dfs = prepare_data(path_to_file_name)
        df = df.transpose()
        df.loc[0:40, 'site'] = url  # tag the rows with the site URL
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        joined_df = pd.concat([joined_df, df])
    print(joined_df)
    WF.write_csv(joined_df, "joined_df" + file_name)
    return joined_df
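# Growing a dataframe inside a loop recopies all prior rows on every
# iteration. A sketch of the usual alternative for the function above:
# collect the per-URL frames in a list and concatenate once at the end.
# SUP, prepare_data and the column logic are taken from data_preparation();
# the function name is hypothetical.
def data_preparation_concat_sketch():
    frames = []
    for url in SUP.read_urls():
        path_to_file_name = SUP.merge_file_path(SUP.multi_files_names(url))
        df, _, _ = prepare_data(path_to_file_name)
        df = df.transpose()
        df.loc[0:40, 'site'] = url
        frames.append(df)
    return pd.concat(frames)  # one copy instead of one per iteration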
def z_score(file_name, path_to_file_name, is_Simul, isMax, max_number_or_var,
            min_number):
    # Bring the necessary data into the proper format.
    df, df_cumul_sum, list_of_dfs = prepare_data(path_to_file_name)

    df_removed = df  # the adjusted dataframe will be stored here
    outliers_list = []  # outliers based on the final cumulative sum
    threshold = 1.8
    number_of_outliers = 0
    for index in range(len(list_of_dfs)):  # go through the dataframe of each page
        mean = np.mean(list_of_dfs[index])  # mean over the instances of one page
        std = np.std(list_of_dfs[index])  # std over the instances of one page
        # Dataframe of outliers based on z = (X - mean) / std.
        df_outliers = list_of_dfs[index][
            np.abs(list_of_dfs[index] - mean) / std > threshold]
        df_outliers, isMax, max_number_or_var, number_of_outliers, df_removed, outliers_list = upper_bound_check(
            is_Simul, df_outliers, isMax, max_number_or_var,
            number_of_outliers, df_removed, outliers_list)
    if is_Simul:
        WF.write_csv(df_removed, "z-score_removed_" + file_name)
    print("Outliers based on Z-score based on final CUMUL_SUM:")
    print_comparison("Z-score_CUMUL_SUM", path_to_file_name,
                     outliers_list, number_of_outliers, df, df_removed,
                     len(df_cumul_sum.index))  # print summary information
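# A minimal, self-contained sketch of the z-score test used above, on an
# invented Series; the threshold 1.8 matches the value in z_score().
def _z_score_demo():
    import numpy as np
    import pandas as pd
    s = pd.Series([10.0, 10.2, 9.9, 10.1, 10.0, 14.0])
    z = np.abs(s - np.mean(s)) / np.std(s)
    return s[z > 1.8]  # only 14.0 has |z| > 1.8 and is flagged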
# Example no. 4
def find_callfor_IQR(URL, path_to_file_name, is_Simul):
    # Call search_marge_files_names() to get the specific file name for the merged data.
    file_name = search_marge_files_names(URL)
    # Call merge_file_path() to enter the directory of "file_name" and locate the file.
    file_name1 = merge_file_path(file_name)
    df = pd.read_csv(file_name1)
    df_removed = df  # the adjusted dataframe will be stored here
    header_list = list(df.columns)
    outliers_list = []
    page_outliers_dict = {}
    for i in header_list[1:41]:
        outliers = IQR.detect_outliers_with_IQR(df[i])
        if len(outliers) > 0:
            page_outliers_dict[i] = "1"
            df_removed = df_removed.drop(columns=i)  # remove the outlier's column
    outliers_list.append(page_outliers_dict)
    if is_Simul:
        WF.write_csv(df_removed, "iqr_all_points_removed_" + file_name)
    return "IQR_ALL_POINTS", path_to_file_name, outliers_list, len(
        outliers_list), df, df_removed, len(header_list) - 1
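# IQR.detect_outliers_with_IQR is not shown in this snippet. A plausible
# minimal sketch, assuming it returns the values of a single column that fall
# outside the 1.5*IQR fences (the real module's implementation may differ):
def detect_outliers_with_IQR_sketch(column):
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    return column[(column < q1 - 1.5 * iqr) | (column > q3 + 1.5 * iqr)]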
def read_files(feature_names, urls_names):
    path = '/home/batyi/Desktop/Study/Study project/website_fingerprinting_proxy-stable/workspace/features_files/'
    sourcepath = os.listdir(path)
    df_main = pd.DataFrame()
    # The maximum number of rows among the files was measured once as 29021.
    for name in feature_names:
        for sfile in sourcepath:
            if name + '_tls.csv' == sfile:
                df = pd.read_csv(path + name + '_tls.csv')
                # Replace 0 with -1 in the 'direction' column.
                df = df.replace({'direction': 0}, -1)
                # Append the file's direction sequence as one row
                # (DataFrame.append was removed in pandas 2.0).
                df_main = pd.concat(
                    [df_main, df['direction'].to_frame().T],
                    ignore_index=True, sort=False)
                break
    df_main = df_main.fillna(0)
    df_main = df_main.loc[:, 0:4999]  # keep the first 5000 columns (0..4999)
    df_main['index'] = feature_names

    # Rearrange the columns: move the last column ('index') to the first position.
    cols = df_main.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df_main = df_main[cols]

    df_main['site'] = urls_names
    WF.write_csv(df_main, "cnn_dataframe.csv")
    print(df_main)
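# A minimal sketch of the padding/truncation that fillna(0) and
# loc[:, 0:4999] accomplish above: every direction sequence is right-padded
# with zeros and cut to a fixed length of 5000, a fixed-size input for the
# CNN dataframe. The helper name is hypothetical.
def pad_or_truncate(directions, length=5000):
    import numpy as np
    row = np.zeros(length)
    row[:min(len(directions), length)] = directions[:length]
    return row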
# Example no. 6
def find_callfor_LR(URL, path_to_file_name, is_Simul):
    # Call search_marge_files_names() to get the specific file name for the merged data.
    file_name = search_marge_files_names(URL)
    # Call merge_file_path() to enter the directory of "file_name" and locate the file.
    file_name1 = merge_file_path(file_name)
    df = pd.read_csv(file_name1)
    df_removed = df  # the adjusted dataframe will be stored here

    outliers_list = []
    header_list = list(df.columns)
    df2 = LR.get_summary_fv_ds()
    page_outliers_dict = {}
    for i in header_list[1:41]:
        # Create a two-column dataframe: the index and the current page column.
        df1 = pd.DataFrame(data=df, columns=['index', i])
        outliers_RMS, max_point = LR.detect_outliers_with_LR(df1, df2)
        if outliers_RMS > max_point:
            page_outliers_dict[i] = "1"
            df_removed = df_removed.drop(columns=i)  # remove the outlier's column
    if len(page_outliers_dict) != 0:
        outliers_list.append(page_outliers_dict)
    if is_Simul:
        WF.write_csv(df_removed, "lr_all_points_removed_" + file_name)
    return "LR_ALL_POINTS", path_to_file_name, outliers_list, len(
        outliers_list), df, df_removed, len(header_list) - 1
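# LR.detect_outliers_with_LR is not shown in this snippet. The call above
# suggests residual-based detection: fit a line to the page's values, compute
# the root-mean-square of the residuals, and flag the page when it exceeds a
# reference point. A loose, hypothetical sketch of that idea (the real
# function's signature and logic may differ):
def detect_outliers_with_LR_sketch(values):
    import numpy as np
    x = np.arange(len(values))
    slope, intercept = np.polyfit(x, values, 1)  # least-squares line fit
    residuals = values - (slope * x + intercept)
    return np.sqrt(np.mean(residuals ** 2))  # RMS of the residuals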
# Example no. 7
def find_min_max_mean(URL):
    # Call multi_files_names() to get the specific file name for the merged data.
    file_name = multi_files_names(URL)
    # Call merge_file_path() to enter the directory of "file_name" and locate the file.
    file_name = merge_file_path(file_name)
    df = pd.read_csv(file_name)

    header_list = list(df.columns)
    # Research ref:
    # https://stackoverflow.com/questions/12920214/python-for-syntax-block-code-vs-single-line-generator-expressions
    # Split each feature column header and concatenate the header identifier
    # with the crawl number.
    new_header_list = []
    for s in header_list[1:41]:
        s = s.split("_")
        new_header_list.append(s[0] + "." + s[1])

    # Research ref:
    # https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/
    df_data = pd.DataFrame(data=df)
    # Delete the "index" column from the dataframe.
    df_data = df_data.drop(columns="index")

    # Research ref:
    # https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
    # Rename all columns in one go with the cleaned header list.
    df_data.columns = new_header_list

    sns.boxplot(x="variable", y="value", data=pd.melt(df_data))
    plt.title('Detecting Outliers')
    plt.show()

    # Research refs:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html
    # https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/
    # https://stackoverflow.com/questions/20461165/how-to-convert-index-of-a-pandas-dataframe-into-a-column
    # Create a new dataframe that summarizes the dataset
    # (count, mean, std, min, 25%, 50%, 75%, max per column).
    s_data = df.describe(include='all')
    s_data = s_data.drop(columns="index")

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transpose.html
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
    # Transpose so the summary rows become columns.
    df_transposed = s_data.T  # or s_data.transpose()
    df_transposed = df_transposed.reset_index().set_index('index', drop=True)

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-feather
    # Write the summary dataframe to a file in the output directory.
    WF.write_csv(df_transposed)
    WF.write_ori(df_transposed)
    return
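# A minimal sketch of what pd.melt does for the boxplot above: it unpivots a
# wide frame into two columns, "variable" (the original column name) and
# "value", the long format that seaborn's boxplot expects. The numbers below
# are invented for illustration.
def _melt_demo():
    import pandas as pd
    wide = pd.DataFrame({'1.1': [3, 4], '1.2': [5, 6]})
    return pd.melt(wide)
    #   variable  value
    # 0      1.1      3
    # 1      1.1      4
    # 2      1.2      5
    # 3      1.2      6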