def load_dataset(self):
    """ Load all three parts of each dataset and build the dictionary index. """
    if learning_config['dataset_size'] == 'small':
        self.text_training = preprocess.load_from_file(
            learning_config['small_training_set_path'])
        self.text_validation = preprocess.load_from_file(
            learning_config['small_validation_set_path'])
        self.text_test = preprocess.load_from_file(
            learning_config['small_testing_set_path'])
    elif learning_config['dataset_size'] == 'medium':
        self.text_training = preprocess.load_from_file(
            learning_config['medium_training_set_path'])
        self.text_validation = preprocess.load_from_file(
            learning_config['medium_validation_set_path'])
        self.text_test = preprocess.load_from_file(
            learning_config['medium_testing_set_path'])
    elif learning_config['dataset_size'] == 'large':
        self.text_training = preprocess.load_from_file(
            learning_config['large_training_set_path'])
        self.text_validation = preprocess.load_from_file(
            learning_config['large_validation_set_path'])
        self.text_test = preprocess.load_from_file(
            learning_config['large_testing_set_path'])
    self.text_all = self.text_training + self.text_validation + self.text_test
    print('Total corpus length:', len(self.text_all))
    self.chars = sorted(list(set(self.text_all)))
    print('Total corpus chars:', len(self.chars))

    # Build the dictionary index: char <-> integer id mappings
    print('Building dictionary index ...')
    self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
    self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
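# For reference, a minimal sketch of the `learning_config` dict consulted by
# load_dataset() above. The keys are the ones actually referenced; the example
# paths and values are placeholders, not the repo's real configuration.
learning_config_example = {
    'dataset_size': 'small',  # one of 'small', 'medium', 'large'
    'small_training_set_path': './dataset/small/train.txt',
    'small_validation_set_path': './dataset/small/validation.txt',
    'small_testing_set_path': './dataset/small/test.txt',
    # ... analogous medium_* and large_* keys
}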
def __init__(self, dim_x=32, dim_y=32, dim_z=32, batch_size=32, shuffle=True):
    """ Initialization, for use with the data generator. """
    self.dim_x = dim_x
    self.dim_y = dim_y
    self.dim_z = dim_z
    self.batch_size = batch_size
    self.shuffle = shuffle

    # trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent10.txt'
    self.text = poc.load_from_file(trainset_path)
    print('corpus length:', len(self.text))
    self.chars = sorted(list(set(self.text)))
    print('Total chars:', len(self.chars))

    # Vectorization: build char <-> index mappings
    print('Building dictionary index ...')
    self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
    self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    # Cut the text into semi-redundant sequences of maxlen characters.
    # A good idea: use ave_object_len to determine this hyper-parameter.
    self.maxlen = 100
def __init__(self, host_id=None,
             object_file_path=iu_config['baseline_object_path'],
             stream_directory_path=iu_config['stream_directory_path']):
    """
    :param host_id: Name of the host file without postfix, e.g. host1_max, host2_min or host3_avg
    :param object_file_path: See iu_config, new_objects_path
    :param stream_directory_path: See iu_config, stream_directory_path
    """
    self.host_id = host_id
    self.object_file_path = object_file_path
    self.obj_list = preprocess.get_list_of_object(
        seq=preprocess.load_from_file(self.object_file_path), is_sort=False)
    self.stream_directory_path = '../' + stream_directory_path
    self.stream_filename_list = os.listdir(self.stream_directory_path)

    # Create a new directory on each run of the program to hold newly generated test data
    dt = datetime.datetime.now().strftime(self.host_id + '_date_%Y-%m-%d_%H-%M-%S')
    self.storage_dir_name = iu_config['new_pdfs_directory'] + self.host_id + '/' + dt + '/'
    if not os.path.exists(self.storage_dir_name):
        os.makedirs(self.storage_dir_name)
        print('New storage directory built.')
    self.obj_getter = self.obj_generator(self.obj_list)
def dataset2npyfilse():
    """ Split the training set into per-sample files for use with the data generator. """
    # trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent10.txt'
    text = preprocess.load_from_file(trainset_path)
    print('corpus length:', len(text))
    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))

    # Vectorization: build char <-> index mappings
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))

    # Cut the text into semi-redundant sequences of maxlen characters.
    # A good idea: use ave_object_len to determine this hyper-parameter.
    maxlen = 100
    step = 1  # should be set to 1 for best results

    sentences = []  # list of all sequences used as inputs
    next_chars = []  # list of all next chars used as labels
    # Stop at len(text) - maxlen so the label text[i + maxlen] always exists.
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i:i + maxlen])
        preprocess.save_to_file('./npysamples/IDs/id-' + str(i), text[i:i + maxlen])
        next_chars.append(text[i + maxlen])
        preprocess.save_to_file('./npysamples/Labels/id-' + str(i), text[i + maxlen])
    print('semi sequences:', len(sentences))
    print('end...')
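# The comments above suggest deriving maxlen from the average object length
# instead of hard-coding it. A minimal sketch of that idea, assuming
# preprocess.get_list_of_object() returns the list of PDF objects in the
# corpus; ave_object_len itself is a hypothetical helper name.
def ave_object_len(text):
    """ Average length of the PDF objects in the corpus, usable as a
    data-driven value for the maxlen hyper-parameter. """
    obj_list = preprocess.get_list_of_object(text, is_sort=False)
    if not obj_list:
        return 0
    return sum(len(obj) for obj in obj_list) // len(obj_list)

# Example: maxlen = min(100, ave_object_len(text))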
def get_one_object():
    """
    Provide one PDF data object: either an existing object from the corpus
    or a newly generated object sampled online from the learnt model.
    This function is not complete yet!
    """
    object_file_path = '../trainset/pdf_object_trainset_100_to_500_percent33.txt'
    seq = poc.load_from_file(object_file_path)
    obj_list = poc.get_list_of_object(seq, is_sort=False)
    random_object_index = random.randint(50, len(obj_list) - 1)
    obj = obj_list[random_object_index]
    return obj
def __data_generation(self, labels, list_IDs_temp):
    """ Generate a batch of batch_size one-hot encoded samples and labels. """
    # X: (batch_size, maxlen, vocab_size), y: (batch_size, vocab_size)
    X = np.zeros((self.batch_size, self.maxlen, len(self.chars)), dtype=bool)
    y = np.zeros((self.batch_size, len(self.chars)), dtype=bool)

    # Generate data
    for i, ID in enumerate(list_IDs_temp):
        # Load the stored input sequence and its next-char label from disk
        text = poc.load_from_file('./npysamples/IDs/' + ID)
        label = poc.load_from_file('./npysamples/Labels/' + ID)
        for t, char in enumerate(text):
            X[i, t, self.char_indices[char]] = 1
        y[i, self.char_indices[label]] = 1
    return X, y
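# For context, a minimal sketch of how a generator like this is typically
# consumed, in the style of the classic Keras data-generator pattern this class
# follows. The class name DataGenerator, the generate() method, and the
# `partition` dict are assumptions for illustration, not confirmed by this repo.
training_generator = DataGenerator(batch_size=32).generate(labels, partition['train'])
validation_generator = DataGenerator(batch_size=32).generate(labels, partition['validation'])

model.fit_generator(generator=training_generator,
                    steps_per_epoch=len(partition['train']) // 32,
                    validation_data=validation_generator,
                    validation_steps=len(partition['validation']) // 32)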
def __get_objects_sequence(self):
    seq = ''
    for filename in os.listdir(iu_config['new_objects_path']):
        try:
            seq += preprocess.load_from_file(
                iu_config['new_objects_path'] + filename)
        except Exception as e:
            print('Extracting failed from %s:' % filename, file=sys.stderr)
            print(str(e), file=sys.stderr)
    obj_list = preprocess.get_list_of_object(seq=seq, is_sort=False)
    print('obj_list len', len(obj_list))
    print(obj_list)
    return obj_list
""" from __future__ import print_function from keras.models import Model from keras.layers import Input, LSTM, Dense import numpy as np import pdf_object_preprocess as preprocess from config import learning_config batch_size = 128 # Batch size for training. epochs = 100 # Number of epochs to train for. latent_dim = 128 # Latent dimensionality of the encoding space. num_samples = 10000 # Number of samples to train on. # Path to the data txt file on disk. text_training = preprocess.load_from_file(learning_config['small_training_set_path']) \ + preprocess.load_from_file(learning_config['small_validation_set_path']) d = 50 step = 1 # Vectorize the data. input_texts = [] target_texts = [] input_characters = set() target_characters = set() for i in range(0, len(text_training) - d, step): input_text = text_training[i * d:(i + 1) * d] target_text = text_training[i * d + 1:(i + 1) * d + 1] # We use "tab" as the "start sequence" character
def attach_new_object():
    """ Incrementally update a PDF file by attaching a new object. """
    host_names = ['host1', 'host2', 'host3']
    with open(host_directory + host_names[0] + '.pdf', 'br') as f:
        data = f.read()
    print(len(data))

    # Find the last trailer in the PDF file
    trailer_index = 0
    while data.find(b'trailer', trailer_index + 7) != -1:
        trailer_index = data.find(b'trailer', trailer_index + 7)
    print('trailer_index', trailer_index)
    trailer_index_dic_endof = data.find(b'>>', trailer_index)
    print('trailer_index_dic_endof', trailer_index_dic_endof)
    trailer_content = data[trailer_index:trailer_index_dic_endof + 2]
    print('trailer_content', trailer_content)

    # Find the last startxref offset in the PDF file
    startxref_index = trailer_index
    while data.find(b'startxref', startxref_index + 9) != -1:
        startxref_index = data.find(b'startxref', startxref_index + 9)
    index_eof = data.find(b'%%EOF', startxref_index)
    # Indexing bytes in Python 3 yields ints, so test membership in b'\r\n'
    # instead of comparing against bytes literals (the old `== b'\n' or b'\r'`
    # was always truthy).
    if data[startxref_index + 9] in b'\r\n':
        startxref_index += 10
    if data[index_eof - 1] in b'\r\n':
        index_eof -= 1
    startxref_offset = int(data[startxref_index:index_eof])
    print('startxref_offset', startxref_offset)

    # Chain the old xref table by adding a /Prev entry to the new trailer
    trailer_content_new = trailer_content[:-2] + b' /Prev ' \
        + bytes(str(startxref_offset), 'ascii') + b' \n>>'
    print('trailer_content_new', trailer_content_new)

    # Load the PDF objects from file
    seq = poc.load_from_file(
        host_directory + 'gen_objs_20180221_142612_epochs10_div1.5_step1.txt')
    obj_list = poc.get_list_of_object(seq)
    random_object_index = random.randint(0, len(obj_list) - 1)
    obj = obj_list[random_object_index]
    last_object_id = str(get_last_object_id(host_names[0]))
    random_rewrite_object = str(random.randint(1, int(last_object_id)))
    print('len object', len(obj))

    # New startxref offset; valid when attaching exactly one object
    startxref_offset_new = len(data) + 1 + len(random_rewrite_object) + 3 + len(obj)
    print('startxref_offset_new', startxref_offset_new)

    attach_content = bytes(str(random_rewrite_object + ' 0 ' + obj
                               + '\nxref\n0 1\n0000000000 65535 f\n'
                               + random_rewrite_object + ' 1\n'
                               + str(len(data)).zfill(10) + ' 00000 n\n'), 'ascii') \
        + trailer_content_new + b'\nstartxref\n' \
        + bytes(str(startxref_offset_new), 'ascii') + b'\n%%EOF\n'
    print('attach_content\n', attach_content)

    new_pdf_file = data + attach_content
    with open(host_directory + host_names[0] + 'iu_auto7.pdf', 'bw') as f:
        f.write(new_pdf_file)
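# attach_new_object() calls get_last_object_id(), which is defined elsewhere in
# this module. A minimal sketch of what such a helper needs to do, assuming
# object ids can be recovered by scanning for "N G obj" headers; the regex
# approach and the _sketch name are assumptions, not the repo's implementation.
import re

def get_last_object_id_sketch(host_name):
    """ Return the highest object number used in the given host PDF file. """
    with open(host_directory + host_name + '.pdf', 'br') as f:
        data = f.read()
    ids = [int(m.group(1)) for m in re.finditer(rb'(\d+)\s+\d+\s+obj', data)]
    return max(ids) if ids else 0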
def train():
    # trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent01.txt'
    text = poc.load_from_file(trainset_path)
    print('corpus length:', len(text))
    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))

    # Vectorization: build char <-> index mappings
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))

    # Cut the text into semi-redundant sequences of maxlen characters.
    # A good idea: use ave_object_len to determine this hyper-parameter.
    maxlen = 50
    step = 1  # should be set to 1 for best results
    epochs = 10  # number of epochs for training

    sentences = []  # list of all sequences used as inputs
    next_chars = []  # list of all next chars used as labels
    # Stop at len(text) - maxlen so the label text[i + maxlen] always exists.
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i:i + maxlen])
        next_chars.append(text[i + maxlen])
    print('semi sequences:', len(sentences))

    print('One-Hot vectorization...')
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)  # input x
    y = np.zeros((len(sentences), len(chars)), dtype=bool)  # output label y
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    # Build the model: two stacked LSTM layers; we may want to deepen it further.
    print('Build model...')
    model = Sequential()
    model.add(LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(len(chars)))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    input()  # pause for confirmation before the (long) training run
    model.fit(x, y, batch_size=128, epochs=epochs, validation_split=0.2)
    save(model, epochs)
    # del model
    # model = load_model('./modelh5/lstm_text_generation_pdf_objs_1_20180214_235713_epochs10.h5')

    """ Sample the model and generate new objects. """
    diversities = [0.2, 0.5, 1.0, 1.2, 1.5, 1.8]
    generated_obj_max_number = 5
    generated_obj_max_allowed_len = 500
    t_fuzz = 0.9
    p_t = 0.9  # 0.9 for format fuzzing; 0.5 or lower for data fuzzing. Here: format fuzzing.

    # Keep only objects long enough to provide a full seed of maxlen characters
    list_of_objects = poc.get_list_of_object(text)
    list_of_objects_with_maxlen = []
    for o in list_of_objects:
        if len(o) > maxlen:
            list_of_objects_with_maxlen.append(o)

    for diversity in diversities:
        obj_index = random.randint(0, len(list_of_objects_with_maxlen) - 1)
        generated_obj_counter = 0
        generated_obj_len_index = 0
        stop_condition = False
        print()
        print('-- Diversity:', diversity)
        obj_prefix = str(list_of_objects_with_maxlen[obj_index])[0:maxlen]  # seed of length maxlen
        generated = obj_prefix
        prob_vals = '100\n' * maxlen
        learnt_grammar = obj_prefix
        print('--- Generating text with seed:\n "' + obj_prefix + '"')
        sys.stdout.write(generated)

        if generated.endswith('endobj'):
            generated_obj_counter += 1
        if generated_obj_counter > generated_obj_max_number:
            stop_condition = True

        while not stop_condition:
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(obj_prefix):
                x_pred[0, t, char_indices[char]] = 1.
            preds = model.predict(x_pred, verbose=0)[0]
            next_index, prob, preds2 = sample(preds, diversity)

            # With probability (1 - t_fuzz), and only when the model is very
            # confident (> p_t), flip the prediction to the least likely character.
            p_fuzz = random.random()
            if p_fuzz > t_fuzz and preds2[next_index] > p_t:
                next_index = np.argmin(preds2)
                print('FUZZ DONE!')

            next_char = indices_char[next_index]

            # prob_vals += str(preds2[next_index]) + '\n'
            # if preds2[next_index] > 0.9980:
            #     learnt_grammar += next_char
            # else:
            #     learnt_grammar += '.'

            obj_prefix = obj_prefix[1:] + next_char
            generated += next_char
            generated_obj_len_index += 1

            if generated.endswith('endobj'):
                generated_obj_counter += 1
                generated_obj_len_index = 0
            elif generated_obj_len_index > generated_obj_max_allowed_len:
                # Force-close objects that grow past the allowed length
                generated += '\nendobj\n'
                generated_obj_counter += 1
                generated_obj_len_index = 0

            if generated_obj_counter > generated_obj_max_number:
                stop_condition = True

            sys.stdout.write(next_char)
            sys.stdout.flush()

        # Save the generated text to a file inside the program directory
        dt = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S_')
        dir_name = './generated_results/pdfobjs_new/'
        file_name = 'gen_objs' + dt + 'epochs' + repr(epochs) + '_div' \
            + repr(diversity) + '_step' + repr(step) + '.txt'
        poc.save_to_file(dir_name + file_name, generated)
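# train() calls sample() and save(), which are defined elsewhere in this
# module. Minimal sketches consistent with how they are used above: sample()
# follows the standard temperature-sampling helper from the Keras
# lstm_text_generation example, extended to also return the multinomial draw
# and the reweighted distribution to match the three-value call site; the exact
# implementations here are assumptions.
def sample(preds, temperature=1.0):
    """ Sample an index from a probability array, reweighted by temperature. """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)  # re-normalize (softmax with temperature)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas), probas, preds

def save(model, epochs):
    """ Persist the trained model with a timestamped file name (sketch). """
    dt = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S_')
    model.save('./modelh5/lstm_text_generation_pdf_objs' + dt
               + 'epochs' + repr(epochs) + '.h5')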