def dataset2npyfilse():
    """ Prepare one file per training sample, for use with a data generator. """
    # trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent10.txt'
    text = preprocess.load_from_file(trainset_path)
    print('corpus length:', len(text))

    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    # print(chars)

    # Vectorization
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    # print(char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # print(indices_char)

    # Cut the text into semi-redundant sequences of maxlen characters
    maxlen = 100  # Good idea: use ave_object_len to determine this hyper-parameter
    step = 1  # should be set to 1 for the best result
    epochs = 10  # number of epochs for training
    sentences = []  # list of all sentences (model inputs)
    next_chars = []  # list of all next chars (labels)
    # Stop at len(text) - maxlen so that a next char always exists to serve as the label.
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i:i + maxlen])
        preprocess.save_to_file('./npysamples/IDs/id-' + str(i), text[i:i + maxlen])
        next_chars.append(text[i + maxlen])
        preprocess.save_to_file('./npysamples/Labels/id-' + str(i), text[i + maxlen])
    print('semi sequences:', len(sentences))
    print('end...')
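
# The function above only dumps one input file and one label file per sample. Below is a
# minimal sketch of the kind of generator that could consume those files with
# model.fit_generator. It is an illustration only: the function name npy_sample_generator,
# the batch_size parameter, and the reuse of the './npysamples/IDs' and './npysamples/Labels'
# layout are assumptions, not part of the existing code base.
def npy_sample_generator(sample_ids, char_indices, maxlen, batch_size=128):
    """Yield (x, y) one-hot batches built from the saved ID/Label files.

    sample_ids is assumed to be the list of 'i' indices used when the files were saved.
    """
    num_chars = len(char_indices)
    while True:  # Keras generators are expected to loop forever
        for start in range(0, len(sample_ids), batch_size):
            batch_ids = sample_ids[start:start + batch_size]
            x = np.zeros((len(batch_ids), maxlen, num_chars), dtype=np.bool)
            y = np.zeros((len(batch_ids), num_chars), dtype=np.bool)
            for row, sample_id in enumerate(batch_ids):
                sentence = preprocess.load_from_file('./npysamples/IDs/id-' + str(sample_id))
                label = preprocess.load_from_file('./npysamples/Labels/id-' + str(sample_id))
                for t, char in enumerate(sentence):
                    x[row, t, char_indices[char]] = 1
                y[row, char_indices[label]] = 1
            yield x, y
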
def generate_and_fuzz_new_samples(self, model=None, model_name='model_1', epochs=1, current_epoch=1, dir_name=None):
    """
    Sample the model and generate new objects.
    :param model: The model which is being trained.
    :param model_name: Name of the model (based on the hyper-parameter config in deep_model.py), e.g. [model_1, model_2, ...]
    :param epochs: Number of total epochs of training, e.g. 10, 20, 30, 40, 50 or 60
    :param current_epoch: Number of the current epoch
    :param dir_name: Root directory for this run.
    :return: generated_total, the concatenation of all objects generated for the last diversity.
    """
    # End time of current epoch
    dt = datetime.datetime.now().strftime('_date_%Y-%m-%d_%H-%M-%S')
    dir_name = dir_name + 'epoch_' + str(current_epoch) + dt + '/'
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    # Fuzzing hyper-parameters
    diversities = [i * 0.10 for i in range(1, 20, 2)]
    # diversities = [0.2, 0.5, 1.0, 1.2, 1.5, 1.8]
    # diversities = [0.5, 1.0, 1.5]  # for sou and for mou
    diversities = [1.0]
    generated_obj_total = 1100  # [5, 10, 100, 1000, 3000] {1000-1100 for sou and 3000-3100 for muo}
    generated_obj_with_same_prefix = 1  # [1, 5, 10, 20, 40] {10 for sou and 20 for mou}
    generated_obj_max_allowed_len = 400  # Maximum allowed length for a generated object
    exclude_from_fuzzing_set = {'s', 't', 'r', 'e', 'a', 'm'}  # set(['s', 't', 'r', 'e', 'a', 'm'])

    # Learn and fuzz paper hyper-parameters
    t_fuzz = 0.9  # For comparison with p_fuzz, where p_fuzz is a random number (fuzz if p_fuzz > t_fuzz)
    p_t = 0.9  # 0.9 and more for format fuzzing; 0.5 and less for data fuzzing. Now format fuzzing.
    # End of fuzzing hyper-parameters

    testset_objects_list = preprocess.get_list_of_object(self.text_test)
    testset_object_gt_maxlen_list = []
    for obj in testset_objects_list:
        if len(obj) > self.maxlen + len(' endobj'):
            testset_object_gt_maxlen_list.append(obj)
    print('len filtered test-set: ', len(testset_object_gt_maxlen_list))

    generated_total = ''
    for diversity in diversities:
        generated_total = ''
        for q in range(round(generated_obj_total / generated_obj_with_same_prefix)):
            obj_index = random.randint(0, len(testset_object_gt_maxlen_list) - 1)
            # obj_index = 0
            generated_obj_counter = 0
            generated_obj_len = 0
            generated = ''

            stop_condition = False
            endobj_attach_manually = False
            # print()
            print('-- Diversity:', diversity)

            obj_prefix = str(testset_object_gt_maxlen_list[obj_index])[0:self.maxlen]
            generated += obj_prefix
            # prob_vals = '1 ' * self.maxlen
            # learnt_grammar = obj_prefix
            # print('--- Generating ts_text with seed:\n "' + obj_prefix + '"')
            # sys.stdout.write(generated)

            if generated.endswith('endobj'):
                generated_obj_counter += 1

            if generated_obj_counter > generated_obj_with_same_prefix:
                stop_condition = True

            while not stop_condition:
                x_pred = np.zeros((1, self.maxlen, len(self.chars)))
                for t, char in enumerate(obj_prefix):
                    x_pred[0, t, self.char_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index, prob, preds2 = self.sample(preds, diversity)
                next_char = self.indices_char[next_index]
                next_char_for_prefix = next_char

                # ###### Fuzzing section; we don't need it yet!
                # if next_char not in exclude_from_fuzzing_set:
                #     p_fuzz = random.random()
                #     if p_fuzz > t_fuzz and preds2[next_index] > p_t:
                #         next_index = np.argmin(preds2)
                #         print('((Fuzz!))')
                #     next_char = self.indices_char[next_index]
                # ###### End of fuzzing section

                # print()
                # print(preds2)
                # print(np.argmax(preds))
                # print(preds[np.argmax(preds)])
                # print(prob)
                # print(np.argmax(prob))
                # print('====>', next_index)
                # print(prob[0, next_index])

                # prob_vals += str(preds2[next_index]) + '\n'
                # if preds2[next_index] > 0.9980:
                #     learnt_grammar += next_char
                # else:
                #     learnt_grammar += '.'
                #     input()

                obj_prefix = obj_prefix[1:] + next_char_for_prefix
                generated += next_char_for_prefix  # next_char
                generated_obj_len += 1

                if generated.endswith('endobj'):
                    generated_obj_counter += 1
                    generated_obj_len = 0
                elif (generated.endswith('endobj') is False) and \
                        (generated_obj_len > generated_obj_max_allowed_len):
                    # Attach '\nendobj\n' manually, and reset obj_prefix
                    generated += '\nendobj\n'
                    generated_obj_counter += 1
                    generated_obj_len = 0
                    endobj_attach_manually = True

                if generated_obj_counter >= generated_obj_with_same_prefix:  # Fix: change > to >= (13970315)
                    stop_condition = True
                elif endobj_attach_manually:
                    # Reset the prefix:
                    # We need to modify obj_prefix because we changed the generated object manually.
                    # One possible repair would be:
                    # obj_prefix = obj_prefix[len('\nendobj\n'):] + '\nendobj\n'
                    # Instead of modifying obj_prefix, we can reset the prefix whenever 'endobj' was not
                    # generated automatically. That seems to be the better option, so we do this:
                    # obj_index = random.randint(0, len(testset_object_gt_maxlen_list) - 1)
                    obj_index = 0
                    obj_prefix = str(testset_object_gt_maxlen_list[obj_index])[0:self.maxlen]
                    generated += obj_prefix
                    endobj_attach_manually = False

                # sys.stdout.write(next_char)
                # sys.stdout.flush()
            # print()
            generated_total += generated + '\n'

        # Save the generated result to a file inside the program
        file_name = model_name \
            + '_diversity_' + repr(diversity) \
            + '_epochs_' + repr(epochs) \
            + '_step_' + repr(self.step) \
            + '.txt'
        preprocess.save_to_file(dir_name + file_name, generated_total)
        # preprocess.save_to_file(dir_name + file_name + 'probabilities.txt', prob_vals)
        # preprocess.save_to_file(dir_name + file_name + 'learntgrammar.txt', learnt_grammar)
        print('Diversity %s saved to file successfully.' % diversity)

    print('End of generation method.')
    print('Starting new epoch ...')
    return generated_total
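
# self.sample(preds, diversity) above, and the module-level sample(preds, diversity) used in
# train() below, are referenced but not defined in this section. The sketch below is an
# assumption based on the standard Keras char-RNN temperature-sampling recipe and on how the
# three return values (next_index, prob, preds2) are used here; the real implementation may
# differ. Inside the class it would presumably carry a self parameter with the same body.
def sample(preds, temperature=1.0):
    """Draw one character index from the softmax output `preds`, rescaled by `temperature`."""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-12) / temperature  # avoid log(0)
    exp_preds = np.exp(preds)
    preds2 = exp_preds / np.sum(exp_preds)       # temperature-rescaled distribution
    prob = np.random.multinomial(1, preds2, 1)   # one-hot draw, shape (1, len(preds))
    next_index = np.argmax(prob)
    return next_index, prob, preds2
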
def train():
    # trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent01.txt'
    text = poc.load_from_file(trainset_path)
    print('corpus length:', len(text))

    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    # print(chars)

    # Vectorization
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    # print(char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # print(indices_char)

    # Cut the text into semi-redundant sequences of maxlen characters
    maxlen = 50  # Good idea: use ave_object_len to determine this hyper-parameter
    step = 1  # should be set to 1 for the best result
    epochs = 10  # number of epochs for training
    sentences = []  # list of all sentences (model inputs)
    next_chars = []  # list of all next chars (labels)
    # Stop at len(text) - maxlen so that a next char always exists to serve as the label.
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i:i + maxlen])
        # print(sentences)
        next_chars.append(text[i + maxlen])
        # print(next_chars)
    print('semi sequences:', len(sentences))

    print('One-Hot vectorization...')
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)  # input x
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)  # output label y
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    # Build the model: originally a single LSTM layer, deepened to two stacked LSTM layers
    print('Build model...')
    model = Sequential()
    # model.add(LSTM(128, input_shape=(maxlen, len(chars))))
    model.add(LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
    # model.add(LSTM(128, input_shape=(maxlen, len(chars)),
    #                activation='relu', return_sequences=True, dropout=0.2))
    model.add(LSTM(128, input_shape=(maxlen, len(chars))))
    # model.add(LSTM(128, activation='relu', dropout=0.2))
    model.add(Dense(len(chars)))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    input()  # pause so the configuration can be inspected before training starts
    # sys.exit()

    model.fit(x, y, batch_size=128, epochs=epochs, validation_split=0.2)
    save(model, epochs)

    # del model
    # model = load_model('./modelh5/lstm_text_generation_pdf_objs_1_20180214_235713_epochs10.h5')

    """ Sample the model and generate new objects. """
    diversities = [0.2, 0.5, 1.0, 1.2, 1.5, 1.8]
    # diversities = [0.1, 0.2, 0.3, 0.5, 0.7, 1, 1.2, 1.5, 1.7, 2]
    generated_obj_max_number = 5
    generated_obj_max_allowed_len = 500
    t_fuzz = 0.9
    p_t = 0.9  # 0.9 for format fuzzing; 0.5 or less for data fuzzing. Now format fuzzing.

    list_of_objects = poc.get_list_of_object(text)
    list_of_objects_with_maxlen = []
    for o in list_of_objects:
        if len(o) > maxlen:
            list_of_objects_with_maxlen.append(o)

    for diversity in diversities:
        obj_index = random.randint(0, len(list_of_objects_with_maxlen) - 1)
        generated_obj_counter = 0
        generated_obj_len_index = 0

        stop_condition = False
        print()
        print('-- Diversity:', diversity)

        # generated = ''
        obj_prefix = str(list_of_objects_with_maxlen[obj_index])[0:maxlen]  # len(obj_prefix) equals maxlen here
        generated = obj_prefix
        prob_vals = '100\n' * maxlen
        learnt_grammar = obj_prefix

        print('--- Generating text with seed:\n "' + obj_prefix + '"')
        sys.stdout.write(generated)

        if generated.endswith('endobj'):
            generated_obj_counter += 1

        if generated_obj_counter > generated_obj_max_number:
            stop_condition = True

        while not stop_condition:
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(obj_prefix):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index, prob, preds2 = sample(preds, diversity)

            p_fuzz = random.random()
            if p_fuzz > t_fuzz and preds2[next_index] > p_t:
                next_index = np.argmin(preds2)
                print('FUZZ DONE!')

            next_char = indices_char[next_index]

            # print()
            # print(preds2)
            # print(np.argmax(preds))
            # print(preds[np.argmax(preds)])
            # print(prob)
            # print(np.argmax(prob))
            # print('====>', next_index)
            # print(prob[0, next_index])

            # prob_vals += str(preds2[next_index]) + '\n'
            # if preds2[next_index] > 0.9980:
            #     learnt_grammar += next_char
            # else:
            #     learnt_grammar += '.'
            #     input()

            obj_prefix = obj_prefix[1:] + next_char
            generated += next_char
            generated_obj_len_index += 1

            if generated.endswith('endobj'):
                generated_obj_counter += 1
                generated_obj_len_index = 0
            elif generated_obj_len_index > generated_obj_max_allowed_len:
                generated += '\nendobj\n'
                generated_obj_counter += 1
                generated_obj_len_index = 0

            if generated_obj_counter > generated_obj_max_number:
                stop_condition = True

            sys.stdout.write(next_char)
            sys.stdout.flush()

        # Save the generated text to a file inside the program
        dt = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S_')
        dir_name = './generated_results/pdfobjs_new/'
        file_name = 'gen_objs' + dt + 'epochs' + repr(epochs) + '_div' \
            + repr(diversity) + '_step' + repr(step) + '.txt'
        poc.save_to_file(dir_name + file_name, generated)
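
# save(model, epochs) is called in train() but not defined in this section. A minimal sketch,
# assuming it writes a timestamped HDF5 file under ./modelh5/, consistent with the commented
# load_model path in train(); the real helper may use a different filename or location.
def save(model, epochs):
    """Persist the trained model with a timestamped filename."""
    dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    dir_name = './modelh5/'
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    file_name = 'lstm_text_generation_pdf_objs_1_' + dt + '_epochs' + str(epochs) + '.h5'
    model.save(dir_name + file_name)
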