Example No. 1
    def __init__(self, phase, num_categories, shuffle=False):
        # Load the complete modelnet40 training data
        # Training data is dispersed over 5 files,
        # load them all into data (point clouds) and labels
        points = []
        labels = []
        i = 0
        while True:
            filename = os.path.join("modelnet40",
                                    "ply_data_%s%i.h5" % (phase, i))
            if not os.path.isfile(filename): break
            p, l = utils.load_h5(filename)
            points.extend(np.array(p))
            labels.extend(np.squeeze(np.array(l), 1))
            i += 1

        self.num_examples = len(points)
        self._index_in_epoch = 0
        self._points = np.array(points)
        self._labels = np.zeros((self.num_examples, num_categories))
        self._labels[np.arange(self.num_examples), np.array(labels)] = 1
        self._shuffle = shuffle
        self._epoch_complete = False

        if self._shuffle:
            self._shuffle_data()
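
utils.load_h5 is not defined in this fragment. A minimal sketch of such a helper, assuming the ModelNet40 shards store point clouds in a 'data' dataset and integer class ids in a 'label' dataset (both dataset names are assumptions, not confirmed by the example):

import h5py
import numpy as np

def load_h5(filename):
    # Assumed layout: 'data' = (num_clouds, num_points, 3), 'label' = (num_clouds, 1).
    with h5py.File(filename, "r") as f:
        points = f["data"][:]
        labels = f["label"][:]
    return points, labels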
Example No. 2
import glob
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import utils
import pca.dataanalyzer as da  # assumption: "da" refers to pca.dataanalyzer (cf. the later examples)


def ipheadertask(filelist):
    j = 1
    for fullname in filelist:
        print("Loading filenr: {}".format(j))
        load_dir, filename = os.path.split(fullname)
        df = utils.load_h5(load_dir, filename)
        frames = df['bytes'].values
        for i, frame in enumerate(frames):
            p = np.frombuffer(frame, dtype=np.uint8)  # frombuffer replaces the deprecated np.fromstring
            if p[14] != 69:  # 0x45 = IPv4 with a 5-word (20-byte) IP header
                print("IP Header length not 20! in file {0}".format(filename))
        j += 1
seed = 0
num_headers = 16
dirs = ["C:/Users/salik/Documents/Data/LinuxChrome/{}/".format(num_headers),
        "C:/Users/salik/Documents/Data/WindowsFirefox/{}/".format(num_headers),
        "C:/Users/salik/Documents/Data/WindowsChrome/{}/".format(num_headers),
        "C:/Users/salik/Documents/Data/WindowsSalik/{}/".format(num_headers),
        "C:/Users/salik/Documents/Data/WindowsAndreas/{}/".format(num_headers)]
# dirs = ["E:/Data/h5/https/", "E:/Data/h5/netflix/"]
# dirs = ["C:/Users/salik/Documents/Data/WindowsAndreas/{}/".format(num_headers)]
# step 1: get the data
dataframes = []
num_examples = 0
for dir in dirs:
    for fullname in glob.iglob(dir + '*.h5'):
        filename = os.path.basename(fullname)
        df = utils.load_h5(dir, filename)
        dataframes.append(df)
        num_examples = len(df.values)
# create one large dataframe
data = pd.concat(dataframes)
# shuffle the rows; assign the result back, otherwise the shuffle is discarded
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
num_rows = data.shape[0]
columns = data.columns
print(columns)

# step 3: get features (x) and scale the features
# get x and convert it to numpy array
# x = da.getbytes(data, 1460)
standard_scaler = StandardScaler()
x = da.getbytes(data, num_headers*54)
x_std = standard_scaler.fit_transform(x)
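
da.getbytes is not defined in this snippet (da presumably refers to pca.dataanalyzer, which later examples import). Judging from the padding loops further down in this document, it flattens the 'bytes' column into a fixed-width matrix; a rough sketch under that assumption:

import numpy as np

def getbytes(dataframe, max_length):
    # Assumed behaviour, mirroring the padding loops in the later examples:
    # zero-pad (or clip) each frame to max_length bytes and stack them into a matrix.
    values = dataframe['bytes'].values
    out = np.zeros((values.shape[0], max_length))
    for i, v in enumerate(values):
        row = np.zeros(max_length, dtype=np.uint8)
        n = min(v.shape[0], max_length)
        row[:n] = v[:n]
        out[i] = row
    return out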
Example No. 4
def train_model():
    learning_rate = 0.0001
    batch_size = 128
    c_dim = 1
    n_batch = 10000 // batch_size

    x_input = tf.placeholder(tf.float32, [None, None, None, c_dim],
                             name='x_input')
    y_label2 = tf.placeholder(tf.float32, [None, None, None, c_dim],
                              name='y_label2')
    x1_bic_add = tf.placeholder(tf.float32, [None, None, None, c_dim],
                                name='x1_bic_add')

    output = define_model(x_input, x1_bic_add)

    mse_loss = tf.reduce_mean(tf.square(y_label2 - output))
    train = tf.train.GradientDescentOptimizer(learning_rate).minimize(mse_loss)

    saver = tf.train.Saver(max_to_keep=4)
    tf.add_to_collection("predict", output)

    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        # load train set
        x1d = load_h5('Y_dataset.h5', 'train_cut_l')
        x2d = load_h5('Y_dataset.h5', 'train_cut_h')
        x1_b = load_h5('Y_dataset.h5', 'train_cut_b')
        x1_data = x1d[:10000, :, :, :] / 255
        x2_data = x2d[:10000, :, :, :] / 255
        x1_bic = x1_b[:10000, :, :, :] / 255
        print(x1_data.shape, x2_data.shape, x1_bic.shape)

        counter = 0

        for i in range(2001):
            for idx in range(0, n_batch):
                batch_images = x1_data[idx * batch_size:(idx + 1) * batch_size]
                batch_bic = x1_bic[idx * batch_size:(idx + 1) * batch_size]
                batch_labels = x2_data[idx * batch_size:(idx + 1) * batch_size]

                sess.run(train,
                         feed_dict={
                             x_input: batch_images,
                             y_label2: batch_labels,
                             x1_bic_add: batch_bic
                         })
                counter += 1
                if counter % 50 == 0:
                    print(
                        'Epoch', i, 'n_batch', idx, 'Train loss:',
                        sess.run(mse_loss,
                                 feed_dict={
                                     x_input: batch_images,
                                     y_label2: batch_labels,
                                     x1_bic_add: batch_bic
                                 }))
            rands(x1_data, x2_data, x1_bic)

            if i % 10 == 0:
                saver.save(sess,
                           r'./checkpoint/' + "model_conv/my-model",
                           global_step=i)
                print("save the model")
Example No. 5
                    "--minclass",
                    type=int,
                    nargs="?",
                    dest='minclass',
                    help='Minimum Number of Classes')
parser.add_argument("-w",
                    "--maxclass",
                    type=int,
                    nargs="?",
                    dest='maxclass',
                    help='Maximum Number of Classes')
parser.add_argument("-r",
                    "--repeat",
                    type=bool,
                    nargs="?",
                    dest='repeat',
                    help='Allow Repeated Data Among Clients')
args = parser.parse_args()

trainX, trainY = load_h5("cifar/train_data.h5")
labels = load_labels("cifar/labels.h5")
separated_data = separate_data_by_class(trainX, trainY, labels)
generate_client_dataset_files(dataset=separated_data,
                              directory=args.name,
                              n_clients=args.total,
                              n_samples_min=args.minsample,
                              n_samples_max=args.maxsample,
                              n_classes_min=args.minclass,
                              n_classes_max=args.maxclass,
                              no_repeat=not (args.repeat))
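
separate_data_by_class is not shown in this fragment. A sketch of what such a helper might do (assumed behaviour, not the original implementation): group the CIFAR training samples by class so the client files can be generated per class.

import numpy as np

def separate_data_by_class(trainX, trainY, labels):
    # Assumed: labels is the ordered list of class names and trainY holds class ids
    # (one-hot labels are reduced with argmax). Returns {class_name: samples}.
    y = np.asarray(trainY)
    if y.ndim > 1:
        y = y.argmax(axis=1)
    return {name: trainX[y == idx] for idx, name in enumerate(labels)}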
Example No. 6
# coding: utf-8

# In[69]:

import tensorflow as tf
import utils
import numpy as np
import os

# In[71]:

# load training data and labels
data0 = utils.load_h5("ply_data_train0.h5")
data1 = utils.load_h5("ply_data_train1.h5")
data2 = utils.load_h5("ply_data_train2.h5")
data3 = utils.load_h5("ply_data_train3.h5")
data4 = utils.load_h5("ply_data_train4.h5")

# train_data = data0[0]
# print(np.shape(train_data))

# train_labels = data0[1]
# catagory_names = utils.get_category_names()
# print(np.shape(train_labels))

# In[72]:

# aggregate training data, training label
train_data = np.append(data0[0], data1[0], axis=0)
train_data = np.append(train_data, data2[0], axis=0)
train_data = np.append(train_data, data3[0], axis=0)
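
np.append copies the whole array on every call, so the repeated appends above get expensive as shards are added. A more compact equivalent (a sketch assuming each dataN is the (points, labels) pair returned by utils.load_h5) concatenates all five shards at once:

shards = (data0, data1, data2, data3, data4)
train_data = np.concatenate([d[0] for d in shards], axis=0)
train_labels = np.concatenate([d[1] for d in shards], axis=0)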
Example No. 7

    data = f(SELECTED_SUBSET)
    match_links = link(SELECTED_SUBSET)
    #print("Match links:", match_links)

    print("Precompute SIFT, MSER, {} on {} images. Proceed?".format(SELECTED_VLAD, SELECTED_SUBSET))

    print("Computing SIFT...")
    # 1. Compute SIFT descriptors from a dataset. The supported descriptors are ORB, SIFT and SURF:
    part = 'oxford5k_{}'.format(SELECTED_SUBSET)
    save_to = "pickles/{}_sifts_{}.h5".format(part, SELECTED_VLAD)
    patch_dict = {}
    if SELECTED_VLAD == 'pb-vlad':
        #print("images: ", data)
        sift_descriptors = load_h5(save_to, getPBDescriptors, data_dir, extractor(SELECTED_VLAD), optional_file_list=data)
        #sift_descriptors = load_h5(save_to, generate_patch_data, data_dir, data, patch_dict, rescale_images=False)    
        #save_to = "pickles/{}_patchdict_{}.h5".format(part, SELECTED_VLAD)
        #patch_dict = load_joblib(save_to, lambda data: data, patch_dict)
    else:
        sift_descriptors = load_h5(save_to, getDescriptors, data_dir, extractor(SELECTED_VLAD), optional_file_list=data)
    print("SIFT shape: ", sift_descriptors.shape)

    print("Constructing VISUAL dictionary...")
    # 2. Construct a visual dictionary from the descriptors in path -d, with -w visual words:
    number_of_visual_words = 128
    #visual_dictionary=kMeansDictionary(sift_descriptors, number_of_visual_words)
    #save_to = "pickles/vlad_dict_{}_{}.pickle".format(SELECTED_VLAD, part)
    #visual_dictionary = load_pickle(save_to, kMeansDictionary, sift_descriptors, number_of_visual_words)
    save_to = "pickles/{}_visual-dict_{}.joblib".format(part, 'orig-vlad') #, "orig-vlad") # test if orig-vlad dict helps: SELECTED_VLAD)
    visual_dictionary = load_joblib(save_to, kMeansDictionary, sift_descriptors, number_of_visual_words)
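
kMeansDictionary is not shown in this fragment; a common way to build such a visual vocabulary is to cluster the descriptors with k-means. A sketch (assumed, not the project's actual implementation):

from sklearn.cluster import MiniBatchKMeans

def kMeansDictionary(descriptors, num_words):
    # descriptors: (N, 128) local descriptors; num_words: size of the visual vocabulary.
    # MiniBatchKMeans keeps memory manageable for large descriptor sets.
    return MiniBatchKMeans(n_clusters=num_words, batch_size=1024,
                           random_state=0).fit(descriptors)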
Example No. 8
import glob
import os

import numpy as np
import pandas as pd

import utils


def remove_checksum(nrheaders):
    # nrheaders = 1
    read_dir = '/home/mclrn/Data/linux/'
    headers = '{0}/'.format(nrheaders)
    dataframes = []
    for fullname in glob.iglob(read_dir + headers + '*.h5'):
        filename = os.path.basename(fullname)
        df = utils.load_h5(read_dir + headers, filename)
        dataframes.append(df)
    # create one large dataframe
    df = pd.concat(dataframes)
    # df = pd.read_hdf('/home/mclrn/Data/salik_windows/{0}/'.format(nrheaders), key="extracted_{0}".format(nrheaders))
    df = df.sample(frac=1).reset_index(drop=True)
    values, counts = np.unique(df['label'], return_counts=True)
    print(values, counts)

    #selector = df['label'] == 'youtube'

    values = df['bytes'].values
    bytes = np.zeros((values.shape[0], nrheaders * 54))
    for i, v in enumerate(values):
        payload = np.zeros(nrheaders * 54, dtype=np.uint8)
        payload[:v.shape[0]] = v
        bytes[i] = payload

    #mean = np.mean(bytes, axis=0)
    #min = np.min(bytes, axis=0)
    #max = np.max(bytes, axis=0)
    #print(np.max(bytes[0:, 23]))  # Protocol field if value = 6 then TCP if value = 17 the UDP
    bytes_no_checksum = []
    for j, b in enumerate(bytes):
        if b[23] == 6:
            # TCP
            # if bytenumber in (50, 51):
            #   return "Checksum (TCP header)"
            for i in range(nrheaders):
                b[i * 54 + 50] = 0
                b[i * 54 + 51] = 0
                b[i * 54 + 24] = 0
                b[i * 54 + 25] = 0
        elif b[23] == 17:
            # UDP
            # if bytenumber in (40,41)
            # return "UDP Checksum (UDP Header)"
            for i in range(nrheaders):
                b[i * 42 + 40] = 0
                b[i * 42 + 41] = 0
                b[i * 42 + 24] = 0
                b[i * 42 + 25] = 0
        else:
            print("Byte was not 6 nor 17 but: %d" % bytes[23])

        bytes_no_checksum.append(b)
    new_data = {'bytes': bytes_no_checksum, 'label': df['label'].values}
    new_df = pd.DataFrame(new_data)
    # print(df)
    # print(new_df)
    save_dir = read_dir + "no_checksum/" + headers
    # if not os.path.exists(save_dir):
    os.makedirs(save_dir, exist_ok=True)
    new_df.to_hdf(save_dir + "extracted_{0}-no_checksum".format(nrheaders) +
                  '.h5',
                  key='extracted_{0}'.format(nrheaders),
                  mode='w')
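
The file written above can be read back with pandas using the same key; a small usage sketch (the header count is illustrative):

import pandas as pd

nrheaders = 16  # illustrative
path = '/home/mclrn/Data/linux/no_checksum/{0}/extracted_{0}-no_checksum.h5'.format(nrheaders)
df = pd.read_hdf(path, key='extracted_{0}'.format(nrheaders))
print(df['label'].value_counts())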
Example No. 9
def load(fname, fdir, fs, trial_start_end_seconds, conditions, num_avg_groups):
    """
    Loads a motion-corrected imaging session from HDF5, extracts trial-aligned data for each
    behavioral condition, and returns it packaged as numpy arrays and xarray DataArrays.

    Parameters
    ----------
    fname : string
        file name

    fdir : string
        root file directory. Needs to have a "_framenumberforevents.pkl" file that corresponds to the session!!

    fs : float
        Sampling rate of the recording

    trial_start_end_seconds : list
        list with two float entries: the trial start time and the trial end time, in seconds
    
    conditions : list
        list of strings that correspond to the behavioral conditions to be analyzed 
    
    num_avg_groups : int
        Number of segments to split and average the trials over. Because single-trial plots
        in state space are noisy, the trials are broken up into groups and averaged to obtain
        a less noisy signal.

    Returns
    -------
    data_dict : dictionary
            1st level of dict keys: individual conditions + condition combined data
                2nd level of keys :
                    data : numpy 4d array with dimensions (trials,y,x,samples)
                    num_samples : number of samples (time) in a trial
                    num_trials : total number of trials in the condition

    """

    sima_h5_path = os.path.join(fdir, fname + '_sima_mc.h5')
    data_snip = utils.load_h5(sima_h5_path)

    data_dims = data_snip.shape
    tvec = np.linspace(0, data_dims[2] / fs, data_dims[2])

    #load behavioral data and trial info
    try:
        glob_frame_files = glob.glob(
            fdir +
            "framenumberforevents*")  # look for a file in specified directory
        frame_events = pickle.load(
            open(glob_frame_files[0], "rb"),
            encoding="latin1")  # latin1 b/c original pickle made in python 2
    except:
        print(
            'Cannot find behavioral data file or file path is incorrect; utils.extract_trial_data will throw error.'
        )

    # with trial start/end samples,
    trial_window_samp = np.array(trial_start_end_seconds) * fs  # turn trial start/end times into samples
    data_dict = utils.extract_trial_data(data_snip, trial_window_samp[0],
                                         trial_window_samp[1], frame_events,
                                         conditions)
    """let's load data into xarray format, which has numerous 
    advantages over using numpy arrays, one of which is the ability 
    to assign names to dimensions rather than indexing by ints """

    for condition in conditions:

        # create index vectors for data dimensions; xarray stores these indices (e.g. encodes time in seconds in place of samples)
        ypix_vec = range(0, data_dims[0])
        xpix_vec = range(0, data_dims[1])
        flattenpix_vec = range(0, data_dims[0] * data_dims[1])
        trials_vec = range(data_dict[condition]['num_trials'])
        data_dict['trial_tvec'] = np.linspace(
            trial_start_end_seconds[0], trial_start_end_seconds[1],
            data_dict[condition]['num_samples'])

        # xarray with dimensions: x,y,trial,samples
        data_dict[condition]['xarr_data'] = xarray.DataArray(
            data_dict[condition]['data'],
            coords=[trials_vec, ypix_vec, xpix_vec, data_dict['trial_tvec']],
            dims=['trial', 'y', 'x', 'time'])

        # flatten x and y pixels into one dimension
        # reshape data and make xarray with dims: x-y,trial,samples
        flatten_pix_trial_data = np.reshape(
            data_dict[condition]['data'],
            (len(trials_vec), data_dims[0] * data_dims[1],
             len(data_dict['trial_tvec'])))
        data_dict[condition]['xarr_flatten_xy'] = xarray.DataArray(
            flatten_pix_trial_data,  # this flattens only the x,y dimensions
            coords=[trials_vec, flattenpix_vec, data_dict['trial_tvec']],
            dims=['trial', 'yx', 'time'])

        # average across trials
        data_dict[condition]['xarr_flatten_pix_trialAvg'] = data_dict[
            condition]['xarr_flatten_xy'].mean(dim='trial')

        ### https://stackoverflow.com/questions/43015638/xarray-reshape-data-split-dimension
        # unstack trials into groups and average across trials (avged trials grouped by time)

        num_trials_to_avg = data_dict[condition]['num_trials'] / num_avg_groups

        # need to create a pandas multi-index to tell xarray the target dimensions to unpack into
        ind = pd.MultiIndex.from_product(
            [np.arange(0, num_trials_to_avg),
             np.arange(0, num_avg_groups)],
            names=['trials', 'trial_groups'
                   ])[np.arange(0, data_dict[condition]['num_trials'])]
        # last arange cuts the index list if the number of trials per group does not divide evenly into the total number of trials

        data_dict[condition]['xarr_flatten_xy_group_trials'] = data_dict[
            condition]['xarr_flatten_xy'].assign_coords(
                trial=ind).unstack('trial').mean(dim='trials').transpose(
                    'trial_groups', 'yx', 'time')
        ###

    # pull out all trial-avged data for each cond, then average across conditions
    data_dict['all_cond'] = {}

    # make an array with dimensions trials, xy_pixels, samples where trials from all conditions are stacked in the first dimension
    stacked_data = np.stack([
        data_dict[condition]['xarr_flatten_xy'].data
        for condition in conditions
    ],
                            axis=0)
    data_shape = stacked_data.shape
    data_dict['all_cond']['flattenpix'] = stacked_data.reshape(
        data_shape[0] * data_shape[1], data_shape[2], data_shape[3])

    data_dict['all_cond']['flattenpix_trial_cond_avg'] = np.average([
        data_dict[condition]['xarr_flatten_pix_trialAvg'].data
        for condition in conditions
    ],
                                                                    axis=0)

    return data_dict
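
A usage sketch for the loader above (file name, directory, sampling rate and condition names are all hypothetical):

# Hypothetical session parameters; adjust to the actual recording.
data_dict = load(fname='session_001',
                 fdir='/data/imaging/session_001/',
                 fs=5.0,
                 trial_start_end_seconds=[-2.0, 8.0],
                 conditions=['cs_plus', 'cs_minus'],
                 num_avg_groups=4)

per_trial = data_dict['cs_plus']['xarr_data']                    # dims: trial, y, x, time
grouped = data_dict['cs_plus']['xarr_flatten_xy_group_trials']   # dims: trial_groups, yx, time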
Example No. 10
    def make_emb_db(self,
                    args,
                    net,
                    data_loader,
                    eval_sampled,
                    eval_per_class,
                    newly_trained=True,
                    batch_size=None,
                    mode='val'):
        """

        :param batch_size:
        :param eval_sampled:
        :param eval_per_class:
        :param newly_trained:
        :param mode:
        :param args: utils args
        :param net: trained top_model network
        :param data_loader: DataLoader object
        :return: None
        """

        if newly_trained:
            net.eval()
            if batch_size is None:
                batch_size = args.batch_size

            steps = int(np.ceil(len(data_loader) / batch_size))

            test_classes = np.zeros(((len(data_loader.dataset))))
            test_seen = np.zeros(((len(data_loader.dataset))))
            test_paths = np.empty(dtype='S20',
                                  shape=((len(data_loader.dataset))))
            if args.feat_extractor == 'resnet50':
                test_feats = np.zeros((len(data_loader.dataset), 2048))
            elif args.feat_extractor == 'resnet18':
                test_feats = np.zeros((len(data_loader.dataset), 512))
            else:
                raise Exception('Unhandled feature extractor')

            for idx, (img, lbl, seen, path) in enumerate(data_loader):

                if args.cuda:
                    img = img.cuda()
                img = Variable(img)

                output = net.forward(img, None, single=True)
                output = output.data.cpu().numpy()

                end = min((idx + 1) * batch_size, len(test_feats))

                test_feats[idx * batch_size:end, :] = output
                test_classes[idx * batch_size:end] = lbl
                test_paths[idx * batch_size:end] = path
                test_seen[idx * batch_size:end] = seen.to(int)

            utils.save_h5(f'{mode}_ids', test_paths, 'S20',
                          os.path.join(self.save_path, f'{mode}Ids.h5'))
            utils.save_h5(f'{mode}_classes', test_classes, 'i8',
                          os.path.join(self.save_path, f'{mode}Classes.h5'))
            utils.save_h5(f'{mode}_feats', test_feats, 'f',
                          os.path.join(self.save_path, f'{mode}Feats.h5'))
            utils.save_h5(f'{mode}_seen', test_seen, 'i2',
                          os.path.join(self.save_path, f'{mode}Seen.h5'))

        test_feats = utils.load_h5(
            f'{mode}_feats', os.path.join(self.save_path, f'{mode}Feats.h5'))
        test_classes = utils.load_h5(
            f'{mode}_classes', os.path.join(self.save_path,
                                            f'{mode}Classes.h5'))
        test_seen = utils.load_h5(
            f'{mode}_seen', os.path.join(self.save_path, f'{mode}Seen.h5'))

        utils.calculate_k_at_n(args,
                               test_feats,
                               test_classes,
                               test_seen,
                               logger=self.logger,
                               limit=args.limit_samples,
                               run_number=args.number_of_runs,
                               save_path=self.save_path,
                               sampled=eval_sampled,
                               per_class=eval_per_class,
                               mode=mode)

        self.logger.info('results at: ' + self.save_path)
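
The utils.save_h5/utils.load_h5 pair used above is not shown. A minimal sketch consistent with the calls here, storing one named dataset per file (an assumption, not the project's actual implementation):

import h5py

def save_h5(name, data, dtype, path):
    # dtype strings such as 'S20', 'i8', 'f', 'i2' are passed straight through to h5py.
    with h5py.File(path, 'w') as f:
        f.create_dataset(name, data=data, dtype=dtype)

def load_h5(name, path):
    with h5py.File(path, 'r') as f:
        return f[name][:]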
def compare_data(path1, path2, nrheaders):
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()

    for fullname in glob.iglob(path1 + '*.h5'):
        filename = os.path.basename(fullname)
        df1 = utils.load_h5(path1, filename)

    for fullname in glob.iglob(path2 + '*.h5'):
        filename = os.path.basename(fullname)
        df2 = utils.load_h5(path2, filename)

    classes = []
    # find all classes
    for label in set(df1['label']):
        classes.append(label)
    print(set(df1['label']))
    print(set(df2['label']))
    # filter on classes
    for c in classes:
        ## Exclude youtube as it contains both UDP and TCP
        if (c == 'youtube'):
            continue

        # create selector
        df1_selector = df1['label'] == c
        df2_selector = df2['label'] == c

        df1_values = df1[df1_selector]['bytes'].values
        df2_values = df2[df2_selector]['bytes'].values

        df1_bytes = np.zeros((df1_values.shape[0], nrheaders * 54))
        df2_bytes = np.zeros((df2_values.shape[0], nrheaders * 54))

        for i, v in enumerate(df1_values):
            payload = np.zeros(nrheaders * 54, dtype=np.uint8)
            payload[:v.shape[0]] = v
            df1_bytes[i] = payload

        for i, v in enumerate(df2_values):
            payload = np.zeros(nrheaders * 54, dtype=np.uint8)
            payload[:v.shape[0]] = v
            df2_bytes[i] = payload

        # Extract byte 23 to determine the protocol.
        TCP = int(df2_bytes[0][23]) == 6

        df1_mean = np.mean(df1_bytes, axis=0)
        df2_mean = np.mean(df2_bytes, axis=0)

        df1_min = np.min(df1_bytes, axis=0)
        df2_min = np.min(df2_bytes, axis=0)

        df1_max = np.max(df1_bytes, axis=0)
        df2_max = np.max(df2_bytes, axis=0)

        for index, mean in enumerate(df1_mean):
            if (index % 25 == 0):
                print(c, index)
            if df1_mean[index] > 0 or df2_mean[index] > 0:
                if (int(df1_min[index]) != int(df1_max[index])
                        or int(df2_min[index]) != int(df2_max[index])):
                    if (int(df1_mean[index]) != int(df2_mean[index])
                            or int(df1_min[index]) != int(df2_min[index])
                            or int(df1_max[index]) != int(df2_max[index])):
                        print(index, " : ", int(df1_mean[index]), ' : ',
                              int(df2_mean[index]), int(df1_min[index]),
                              int(df2_min[index]), int(df1_max[index]),
                              int(df2_max[index]))
                        headername = byteindextoheaderfield(index, TCP)
                        headername = headername.replace('/', ' ')
                        createBoxplotsFromColumns(
                            c + headername + ':' + str(index),
                            df1_bytes[:, index], df2_bytes[:, index])
import pca.dataanalyzer
import utils
import glob
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pca.dataanalyzer import byteindextoheaderfield

num_headers = 8
train_dir = 'C:/Users/salik/Documents/Data/LinuxChrome/{}/'.format(num_headers)
dataframes = []
for fullname in glob.iglob(train_dir + '*.h5'):
    filename = os.path.basename(fullname)
    df = utils.load_h5(train_dir, filename)
    dataframes.append(df)
# create one large dataframe
data = pd.concat(dataframes)
print(len(data))
print("drtv:", len(data[data['label'] == 'drtv']))
print("hbo:", len(data[data['label'] == 'hbo']))
print("http:", len(data[data['label'] == 'http']))
print("https:", len(data[data['label'] == 'https']))
print("netflix:", len(data[data['label'] == 'netflix']))
print("twitch:", len(data[data['label'] == 'twitch']))
print("youtube:", len(data[data['label'] == 'youtube']))

# dr_mean, dr_mean_sub, dr_std = getmeanstd(data, 'drtv')
# nf_mean, nf_mean_sub, nf_std = getmeanstd(data, 'netflix')
# mean_diff = dr_mean - nf_mean
# sort_diff = (-abs(mean_diff)).argsort() #Sort on absolute values in decending order
def read_data_sets(train_dirs=[],
                   test_dirs=None,
                   merge_data=True,
                   one_hot=False,
                   dtype=dtypes.float32,
                   validation_size=0.2,
                   test_size=0.2,
                   seed=None,
                   balance_classes=False,
                   payload_length=810):
    trainframes = []
    testframes = []
    for train_dir in train_dirs:
        for fullname in glob.iglob(train_dir + '*.h5'):
            filename = os.path.basename(fullname)
            df = utils.load_h5(train_dir, filename)
            trainframes.append(df)
        # create one large dataframe
    train_data = pd.concat(trainframes)
    if test_dirs and test_dirs != train_dirs:
        for test_dir in test_dirs:
            for fullname in glob.iglob(test_dir + '*.h5'):
                filename = os.path.basename(fullname)
                df = utils.load_h5(test_dir, filename)
                testframes.append(df)
        test_data = pd.concat(testframes)
    else:
        test_data = pd.DataFrame()

    if merge_data:
        train_data = pd.concat([test_data, train_data])

    num_classes = len(train_data['label'].unique())

    if balance_classes:
        values, counts = np.unique(train_data['label'], return_counts=True)
        smallest_class = np.argmin(counts)
        amount = counts[smallest_class]
        new_data = []
        for v in values:
            sample = train_data.loc[train_data['label'] == v].sample(n=amount)
            new_data.append(sample)
        train_data = new_data
        train_data = pd.concat(train_data)

    # shuffle the dataframe and reset the index
    train_data = train_data.sample(frac=1,
                                   random_state=seed).reset_index(drop=True)
    #
    # youtube_selector = train_data['label'] == 'youtube'
    # youtube_data = train_data[youtube_selector]
    # for index, row in youtube_data.iterrows():
    #     bytes = row[0]
    #     if bytes[23] == 17.0:
    #         train_data.loc[index, 'label'] = 'youtube_udp'
    #     else:
    #         train_data.loc[index, 'label'] = 'youtube_tcp'

    if test_dirs and test_dirs != train_dirs:
        test_data = test_data.sample(frac=1,
                                     random_state=seed).reset_index(drop=True)
        test_labels = extract_labels(test_data,
                                     one_hot=one_hot,
                                     num_classes=num_classes)
        test_payloads = test_data['bytes'].values
        test_payloads = utils.pad_arrays_with_zero(
            test_payloads, payload_length=payload_length)
    train_labels = extract_labels(train_data,
                                  one_hot=one_hot,
                                  num_classes=num_classes)
    train_payloads = train_data['bytes'].values
    # pad with zero up to payload_length length
    train_payloads = utils.pad_arrays_with_zero(train_payloads,
                                                payload_length=payload_length)

    # TODO make separate TEST SET once ready
    total_length = len(train_payloads)
    validation_amount = int(total_length * validation_size)
    if merge_data:
        test_amount = int(total_length * test_size)
        test_payloads = train_payloads[:test_amount]
        test_labels = train_labels[:test_amount]
        val_payloads = train_payloads[test_amount:(validation_amount +
                                                   test_amount)]
        val_labels = train_labels[test_amount:(validation_amount +
                                               test_amount)]
        train_payloads = train_payloads[(validation_amount + test_amount):]
        train_labels = train_labels[(validation_amount + test_amount):]
    else:
        val_payloads = train_payloads[:validation_amount]
        val_labels = train_labels[:validation_amount]
        train_payloads = train_payloads[validation_amount:]
        train_labels = train_labels[validation_amount:]

    options = dict(dtype=dtype, seed=seed)
    print("Training set size: {0}".format(len(train_payloads)))
    print("Validation set size: {0}".format(len(val_payloads)))
    print("Test set size: {0}".format(len(test_payloads)))
    train = DataSet(train_payloads, train_labels, **options)
    validation = DataSet(val_payloads, val_labels, **options)
    test = DataSet(test_payloads, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
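
A usage sketch for read_data_sets (directory paths are hypothetical, and the DataSet class is assumed to expose a next_batch method in the style of the classic TensorFlow tutorial datasets):

num_headers = 16  # illustrative
dirs = ['/data/headers/{}/'.format(num_headers)]  # hypothetical location of the .h5 files
datasets = read_data_sets(train_dirs=dirs,
                          test_dirs=dirs,
                          merge_data=True,
                          one_hot=True,
                          validation_size=0.2,
                          test_size=0.2,
                          seed=0,
                          payload_length=num_headers * 54)
batch_x, batch_y = datasets.train.next_batch(128)  # assumes DataSet.next_batch exists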