Пример #1
0
def bin_data(path, write_path, num_chunks, binning):
    """Bins the continuous features through bucket or quantile binning.

    Parameter
    ---------
    path : str
      The path where the dataset to be binned is located.
    write_path : str
      The path where to save the binned dataset.
    num_chunks : int
      The number of file splits to perform on the binned dataset.
    binning : int
      The type of binning to perform on the dataset:
      0 if bucket binning, 1 if quantile (decile) binning.
    """

    # get the list of files found in PATH
    files = nd.list_files(path=path)

    # collect the per-file frames and concatenate once:
    # DataFrame.append is deprecated and removed in pandas >= 2.0
    frames = []
    for file in files:
        frames.append(pd.read_csv(filepath_or_buffer=file,
                                  names=column_names))
        print("appending : {}".format(file))
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # remove dst_ip_add and src_ip_add features
    df = df.drop(labels=["dst_ip_add", "src_ip_add"], axis=1)

    binning = int(binning)  # convert once, not per column
    for col in cols_to_std:
        if binning == 0:
            # bucket binning: 10 equal-width bins over the column's range
            bins = np.linspace(df[col].min(), df[col].max(), 10)
            df[col] = np.digitize(df[col], bins, right=True)
            print("min : {}, max : {}".format(df[col].min(), df[col].max()))
        elif binning == 1:
            # decile (quantile) binning; duplicate bin edges are dropped
            df[col] = pd.qcut(df[col], 10, labels=False, duplicates="drop")
            print("min : {}, max : {}".format(df[col].min(), df[col].max()))

    for chunk_id, df_i in enumerate(np.array_split(df, num_chunks)):
        # split and save the dataframe to CSV files
        # (chunk_id, not `id`, to avoid shadowing the builtin)
        chunk_path = os.path.join(write_path, "{id}.csv".format(id=chunk_id))
        df_i.to_csv(
            path_or_buf=chunk_path,
            columns=columns_to_save,
            header=None,
            index=False,
        )
        # log the actual saved filename (the original omitted ".csv" here)
        print("Saving CSV file : {path}".format(path=chunk_path))
Пример #2
0
def csv_to_npy(csv_path, npy_path, npy_filename):
    """Consolidates CSV files into one de-duplicated NumPy ``.npy`` file.

    Parameter
    ---------
    csv_path : str
      The path where the CSV files to consolidate are located.
    npy_path : str
      The path where to save the NPY file.
    npy_filename : str
      The filename for the saved NPY file.
    """
    files = list_files(path=csv_path)

    # collect the per-file frames and concatenate once:
    # DataFrame.append is deprecated and removed in pandas >= 2.0
    frames = []
    for file in files:
        frames.append(pd.read_csv(filepath_or_buffer=file, header=None))
        print('Appending file : {}'.format(file))
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # drop duplicate rows; `subset` takes column labels (or None for all
    # columns) — the original passed the DataFrame itself, which only
    # worked by accident because iterating a DataFrame yields its columns
    df = df.drop_duplicates(keep='first')

    # documented DataFrame -> ndarray conversion
    data = df.to_numpy()

    np.save(file=os.path.join(npy_path, npy_filename), arr=data)
Пример #3
0
def convert_txt_to_csv(txt_path, csv_path):
    """Converts the Kyoto University dataset TXT files to CSV files.

    Parameter
    ---------
    txt_path : str
      The path where the TXT files are located.
    csv_path : str
      The path where to save the CSV-converted files.
    """

    # list to store the filenames under the subdirectories of <txt_path>
    data = list_files(path=txt_path)

    csv_data = []  # list to store the converted CSV filenames

    # Create the <csv_path> if it does not exist
    if not os.path.exists(csv_path):
        os.makedirs(csv_path)
    else:
        print("CSV folder exists")

    # Create the 12 month subdirectories ("01".."12") under <csv_path>
    # if they are not all present yet
    if len(next(walk(csv_path))[1]) == 12:
        print("Folders exist")
    else:
        print("Creating subdirectories.")
        # dirpath from the <walk> generator (index 0), joined with the
        # zero-padded month number
        base = next(walk(csv_path))[0]
        for month in range(1, 13):
            os.makedirs(os.path.join(base, "{:02d}".format(month)),
                        exist_ok=True)

    for txt_file in data:
        # Store the processed CSV filename in the <csv_data> list.
        # NOTE(review): the original split by csv_path, but the source
        # filenames live under txt_path — splitting by txt_path is the
        # form that yields the relative part; confirm against callers.
        relative = txt_file.split(txt_path)[1].replace("txt", "csv")
        csv_data.append(os.path.join(csv_path, relative))

    for index in range(len(data)):
        # Read the tab-delimited text file and rewrite it as CSV
        try:
            print("Processing: {}".format(data[index]))
            # context managers close both handles (the original leaked
            # them); newline="" is required by the csv module for writers
            with open(data[index], "r") as in_file, \
                    open(csv_data[index], "x", newline="") as out_file:
                in_csv = csv.reader(in_file, delimiter="\t")
                csv.writer(out_file).writerows(in_csv)
        except FileNotFoundError:
            print("File not found: {}".format(data[index]))
        except FileExistsError:
            # "x" mode refuses to overwrite an existing conversion
            print("File already converted: {}".format(csv_data[index]))