def train(hidden_size, batch_size):
    """Train the two-headed (edit-op, char) model on random batches, forever.

    Args:
        hidden_size: width of the model's hidden layer, forwarded to get_model.
        batch_size: number of examples drawn per gradient step
            (passed as chunk_size to gen_large_chunk_single_thread).

    Loops for 1e9 gradient steps (effectively forever). After each step it
    prints the validation metrics; every 100 steps it also prints sample
    predictions (input, target, predicted char, predicted op) for up to ~100
    validation rows.
    """
    batcher = Batcher()
    print('Data:')
    print(batcher.inputs.shape)
    print(batcher.targets.shape)

    model = get_model(hidden_size, batcher.chars_len())
    # Two output heads: 'op' classifies the edit operation, 'char' the character.
    model.compile(loss={'op': 'categorical_crossentropy',
                        'char': 'categorical_crossentropy'},
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    # Class index -> human-readable edit-operation name; anything not listed
    # here (i.e. index 2) decodes as 'delete', matching the original if/elif/else.
    op_names = {0: 'insert', 1: 'replace'}

    for grad_step in range(int(1e9)):
        ppp = gen_large_chunk_single_thread(batcher, batcher.inputs,
                                            batcher.targets,
                                            chunk_size=batch_size)
        (x_train, y_train_1, y_train_2,
         x_val, y_val_1, y_val_2,
         val_sub_inputs, val_sub_targets) = ppp

        model.train_on_batch(x=x_train, y=[y_train_1, y_train_2])
        print(dict(zip(model.metrics_names,
                       model.test_on_batch(x=x_val, y=[y_val_1, y_val_2]))))

        if grad_step % 100 == 0:
            row_x, password_target, password_input = x_val, val_sub_targets, val_sub_inputs
            # predict() returns one array per head: (op logits, char logits).
            ops, char = model.predict(row_x, verbose=0)
            predicted_chars = list(batcher.decode(char))
            ops = ops.argmax(axis=1)
            decoded_op = [op_names.get(op, 'delete') for op in ops]
            for i, (x, y, pc, po) in enumerate(
                    zip(password_input, password_target, predicted_chars, decoded_op)):
                print('x :', x)
                print('y :', y)
                print('predict char :', pc)
                print('predict op :', po)
                print('---------------------')
                if i >= 100:
                    break
def predict_top_most_likely_passwords(sed: Batcher, model, row_x, n):
    """Sample *n* candidate passwords from the model's per-position char distribution.

    Args:
        sed: Batcher providing vocab/length constants and decode().
        model: trained model; the first output head gives per-position
            character probabilities for row_x.
        row_x: encoded input row(s) fed to model.predict.
        n: how many candidate passwords to sample.

    Returns:
        A list of n decoded password strings.
    """
    char_probs = model.predict(row_x, batch_size=32, verbose=0)[0]
    vocab_indices = range(sed.ENCODING_MAX_SIZE_VOCAB + 2)
    sampled_passwords = []
    for _ in range(n):
        # of course should take the edit distance constraint.
        drawn = np.array([
            np.random.choice(a=vocab_indices, size=1, p=char_probs[pos, :])
            for pos in range(sed.ENCODING_MAX_PASSWORD_LENGTH)
        ]).flatten()
        sampled_passwords.append(sed.decode(drawn, calc_argmax=False))
    return sampled_passwords