import csv
import os

import numpy as np
import pandas as pd

# `list_files`, `column_names`, `cols_to_std`, and `columns_to_save` are
# assumed to be defined or imported elsewhere in this module.


def bin_data(path, write_path, num_chunks, binning):
    """Bins the continuous features through bucket or quantile binning

    Parameters
    ----------
    path : str
        The path where the dataset to be binned is located.
    write_path : str
        The path where to save the binned dataset.
    num_chunks : int
        The number of file splits to perform on the binned dataset.
    binning : int
        The type of binning to perform on the dataset:
        0 if bucket binning, 1 if quantile binning.
    """

    # get the list of files found in <path>
    files = list_files(path=path)

    df = pd.DataFrame()

    for file in files:
        # append the data from the CSV files to the dataframe
        df = pd.concat([df, pd.read_csv(filepath_or_buffer=file, names=column_names)])
        print("appending : {}".format(file))

    # remove the dst_ip_add and src_ip_add features
    df = df.drop(labels=["dst_ip_add", "src_ip_add"], axis=1)

    for index in range(len(cols_to_std)):
        if int(binning) == 0:
            # bucket binning: 10 equal-width bins over the feature's range
            bins = np.linspace(
                df[cols_to_std[index]].min(), df[cols_to_std[index]].max(), 10
            )
            df[cols_to_std[index]] = np.digitize(
                df[cols_to_std[index]], bins, right=True
            )
            print(
                "min : {}, max : {}".format(
                    df[cols_to_std[index]].min(), df[cols_to_std[index]].max()
                )
            )

        if int(binning) == 1:
            # decile binning: 10 equal-frequency (quantile) bins
            df[cols_to_std[index]] = pd.qcut(
                df[cols_to_std[index]], 10, labels=False, duplicates="drop"
            )
            print(
                "min : {}, max : {}".format(
                    df[cols_to_std[index]].min(), df[cols_to_std[index]].max()
                )
            )

    for idx, df_i in enumerate(np.array_split(df, num_chunks)):
        # split the dataframe into <num_chunks> parts, and save each to a CSV file
        df_i.to_csv(
            path_or_buf=os.path.join(write_path, "{id}.csv".format(id=idx)),
            columns=columns_to_save,
            header=None,
            index=False,
        )
        print(
            "Saving CSV file : {path}".format(
                path=os.path.join(write_path, "{id}.csv".format(id=idx))
            )
        )
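# A minimal usage sketch for bin_data (hypothetical paths; assumes the
# module-level `column_names`, `cols_to_std`, and `columns_to_save` lists
# match the dataset's schema):
#
#   bin_data(path="dataset/train", write_path="dataset/train_binned",
#            num_chunks=10, binning=1)  # 1 = decile (quantile) binning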
def csv_to_npy(csv_path, npy_path, npy_filename):
    """Converts the CSV files into a single NPY file

    Parameters
    ----------
    csv_path : str
        The path where the CSV files are located.
    npy_path : str
        The path where to save the NPY file.
    npy_filename : str
        The filename of the NPY file to save.
    """

    # get the list of files found in <csv_path>
    files = list_files(path=csv_path)

    df = pd.DataFrame()

    for file in files:
        # append the data from the CSV files to the dataframe
        df = pd.concat([df, pd.read_csv(filepath_or_buffer=file, header=None)])
        print("Appending file : {}".format(file))

    # remove duplicate rows, keeping only their first occurrence
    df = df.drop_duplicates(keep="first")

    # convert the dataframe to a NumPy array, then save it as a NPY file
    data = np.array(df)
    np.save(file=os.path.join(npy_path, npy_filename), arr=data)
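# A minimal usage sketch for csv_to_npy (hypothetical paths):
#
#   csv_to_npy(csv_path="dataset/train_binned", npy_path="dataset",
#              npy_filename="train.npy")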
def convert_txt_to_csv(txt_path, csv_path):
    """Converts the Kyoto University dataset TXT files to CSV files

    Parameters
    ----------
    txt_path : str
        The path where the TXT files are located.
    csv_path : str
        The path where to save the CSV-converted files.
    """

    # list to store the filenames under the subdirectories of the <txt_path>
    data = list_files(path=txt_path)

    csv_data = []  # list to store the converted CSV filenames

    # create the <csv_path> if it does not exist
    if not os.path.exists(csv_path):
        os.makedirs(csv_path)
    else:
        print("CSV folder exists")

    for month in range(12):
        # create the monthly subdirectories under the <csv_path>
        # if they do not exist
        if len(next(os.walk(csv_path))[1]) == 12:
            print("Folders exist")
            break
        print("Creating subdirectories.")
        # get the dirpath from the generator object <os.walk> (index 0),
        # then join the dirpath with the zero-padded month number
        os.makedirs(
            os.path.join(next(os.walk(csv_path))[0], "{:02d}".format(month + 1))
        )

    for index in range(len(data)):
        # store the processed CSV filename in the <csv_data> list,
        # mirroring the TXT file's location relative to <txt_path>
        csv_data.append(
            os.path.join(
                csv_path,
                os.path.relpath(data[index], start=txt_path).replace("txt", "csv"),
            )
        )

    for index in range(len(data)):
        # read the tab-delimited text file, and write it out as CSV
        try:
            print("Processing: {}".format(data[index]))
            with open(data[index], "r") as txt_file, open(csv_data[index], "x") as csv_file:
                in_csv = csv.reader(txt_file, delimiter="\t")
                out_csv = csv.writer(csv_file)
                out_csv.writerows(in_csv)
        except FileNotFoundError:
            print("File not found: {}".format(data[index]))
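# A minimal usage sketch for convert_txt_to_csv (hypothetical paths; expects
# the Kyoto dataset TXT files under monthly subdirectories of <txt_path>):
#
#   convert_txt_to_csv(txt_path="dataset/txt", csv_path="dataset/csv")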