def execute_with_conf(conf):
    # load program analysis results, format: app, image, layout, permissions
    print('loading program analysis results')
    data_pa = load_data(conf.path_pa)

    # extract drawable images
    print('extracting drawable images')
    log_level = check_conf(conf.log_level, {0, 1, 2}, 0)
    drawable_images = extract_drawable_images(data_pa, conf.path_app, log_level)

    # extract texts, format: layout_texts, embedded_texts, resource_texts
    print('extracting texts')
    search_range = check_conf(conf.layout_text_range, {'parent', 'total'}, 'parent')
    enable_ocr_cache = check_conf(conf.enable_ocr_cache, {True, False}, True)
    is_translate = check_conf(conf.enable_translate, {True, False}, True)
    enable_translate_cache = check_conf(conf.enable_translate_cache, {True, False}, True)
    texts = extract_contextual_texts(data_pa, drawable_images, conf.path_app, conf.path_east,
                                     search_range, (conf.ocr_width, conf.ocr_height),
                                     conf.ocr_padding, enable_ocr_cache, is_translate,
                                     enable_translate_cache, log_level)

    # merge and save the triples <image, texts, permissions>
    print('finished and saving')
    assert len(data_pa) == len(drawable_images) == len(texts)
    # format: [(compressed_img), [[layout_texts], [embedded_texts], [res_texts]], {permissions}]
    data = [[drawable_images[i]] + [texts[i]] + [data_pa[i][-1]]
            for i in range(len(data_pa))]
    save_pkl_data(conf.path_save, data)
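# check_conf is used above but not defined in this file. Judging from its call
# sites, it validates a configured value against a set of allowed options and
# falls back to a default otherwise. A minimal sketch under that assumption
# (the real implementation may differ, e.g., it could also log a warning):
def check_conf(value, allowed_values, default):
    """Return `value` if it is one of `allowed_values`, otherwise `default`."""
    return value if value in allowed_values else default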
def predict(predict_conf):
    # load data
    _, data = load_pkl_data(predict_conf.path_data)

    # load model meta data
    meta = load_pkl_data(predict_conf.path_meta)
    meta_image_shape = meta['ModelConf'].img_shape
    meta_re_sample_type = meta['ModelConf'].img_re_sample
    meta_text_len = meta['ModelConf'].text_length
    meta_label_num = len(meta['label2id'])
    meta_id2label = {v: k for k, v in meta['label2id'].items()}

    # load model
    model = keras.models.load_model(predict_conf.path_model, custom_objects={
        "CoAttentionParallel": CoAttentionParallel
    })

    # prepare data (zero train and valid ratios send everything to the test split)
    _, _, data_test = prepare_data(data, meta_image_shape, meta_re_sample_type,
                                   meta_text_len, meta_label_num, 0, 0)

    # predict with the trained model
    x_test, y_test = data_test
    y_predict = model.predict(x_test)
    y_true = y_test.tolist()

    # save predictions
    save_pkl_data(predict_conf.path_predictions, [y_predict, y_test])

    # print metric results
    scores = evaluate(y_true, y_predict, predict_conf.threshold)
    label_names = [meta_id2label[i] for i in range(len(meta_id2label))]
    display_scores(scores, label_names)
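# load_pkl_data and save_pkl_data are referenced throughout but not defined
# here. Given the .pkl paths they are presumably thin pickle wrappers; a
# minimal sketch under that assumption (the real helpers may add error
# handling or a pickle protocol argument):
import pickle


def load_pkl_data(path):
    """Deserialize and return the object stored at `path`."""
    with open(path, 'rb') as fi:
        return pickle.load(fi)


def save_pkl_data(path, content):
    """Serialize `content` to `path` with pickle."""
    with open(path, 'wb') as fo:
        pickle.dump(content, fo)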
def pre_process_save_results(path_out, data, v2id, l2id, wid_name_list):
    if debug:
        print("v2id")
        print(v2id)
        print("l2id")
        print(l2id)
        print(len(data), len(v2id), len(l2id))
    save_pkl_data(path_out, [(v2id, l2id), data, wid_name_list])
def pre_process_save_results(path_out, data, v2id, l2id):
    save_pkl_data(path_out, [(v2id, l2id), data])
def train(model_conf, train_conf):
    # set up random seed
    random.seed(train_conf.random_seed)

    # check the output path
    if not os.path.exists(train_conf.path_output):
        os.makedirs(train_conf.path_output)

    # load data and print statistics
    (vocab2id, label2id), data = load_pkl_data(train_conf.path_data)
    id2vocab = {v: k for k, v in vocab2id.items()}
    id2label = {v: k for k, v in label2id.items()}
    token_size, label_size = len(id2vocab), len(id2label)
    label_names = [id2label[i] for i in range(len(id2label))]
    print('label size:', label_size, 'token size:', token_size)
    print('label names:', label_names)

    # split data
    train_ratio, valid_ratio, test_ratio = normalize_data_ratio(
        train_conf.train_ratio, train_conf.valid_ratio, train_conf.test_ratio)
    data_train, data_valid, data_test = prepare.prepare_data(
        data, model_conf.img_shape, model_conf.img_re_sample,
        model_conf.text_length, label_size, train_ratio, valid_ratio)
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = data_train, data_valid, data_test
    print('train: {0}; valid: {1}; test: {2}'.format(len(y_train), len(y_valid), len(y_test)))

    # train and test
    scores = []
    predict_threshold = 0.5
    for i in range(train_conf.repeat_times):
        print('{sp}\ntime {i}\n{sp}'.format(sp='=' * 20, i=i))

        # prefix to save the training process
        path_prefix = os.path.join(
            train_conf.path_output,
            'model_{}_{}'.format(train_conf.code_name, i))

        # create and compile the model
        model = create_model_with_conf(token_size, label_size, model_conf)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # init callbacks
        path_cp = path_prefix + '.cp'
        es = EarlyStopping(monitor=train_conf.monitor_type,
                           patience=train_conf.early_stop_patients)
        cp = ModelCheckpoint(filepath=path_cp, monitor=train_conf.monitor_type,
                             save_best_only=True)

        # fit the model
        history = model.fit(x_train, y_train,
                            batch_size=train_conf.batch_size,
                            epochs=train_conf.epochs,
                            verbose=train_conf.verbose,
                            validation_data=(x_valid, y_valid),
                            callbacks=[cp, es])

        # save training history
        save_on_condition(train_conf.is_log_history, path_prefix + '.his', history.history)

        # save the trained model
        model.save(path_prefix + '.h5')

        # save the training meta data, e.g., ModelConf, vocab2id, label2id
        save_pkl_data(path_prefix + '.meta', {
            'ModelConf': model_conf,
            'vocab2id': vocab2id,
            'label2id': label2id
        })

        # test if test_ratio > 0
        if test_ratio > 0:
            # predict with the best checkpoint weights
            model.load_weights(path_cp)
            y_predict = model.predict(x_test)
            y_true = y_test.tolist()

            # save prediction
            if train_conf.is_log_prediction:
                path_predict = path_prefix + '.predictions'
                save_pkl_data(path_predict, [y_predict, y_test])

            # evaluate
            scores_current = metrics.evaluate(y_true, y_predict, predict_threshold)
            metrics.display_scores(scores_current, label_names)
            scores.append(scores_current)

        # prepare for the next loop: re-split the data if required
        if train_conf.is_data_refresh:
            data_train, data_valid, data_test = prepare.prepare_data(
                data, model_conf.img_shape, model_conf.img_re_sample,
                model_conf.text_length, label_size, train_ratio, valid_ratio)
            (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = data_train, data_valid, data_test

    if test_ratio > 0 and len(scores) > 0:
        # average score over the repeated runs
        avg_scores = metrics.compute_mean_var(scores)
        metrics.display_average_scores(avg_scores, label_names, train_conf.repeat_times)

        # store average score
        if train_conf.is_log_avg_score:
            path_avg = os.path.join(
                train_conf.path_output,
                'result_{}.avg.txt'.format(train_conf.code_name))
            with codecs.open(path_avg, mode='w', encoding='UTF-8') as fo:
                metrics.display_average_scores(avg_scores, label_names,
                                               train_conf.repeat_times,
                                               is_k_print=True, fo=fo)
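# normalize_data_ratio is not defined in this file. From its use in train(),
# it appears to rescale the three split ratios so they sum to 1; a minimal
# sketch under that assumption (the real helper may also validate inputs):
def normalize_data_ratio(train_ratio, valid_ratio, test_ratio):
    """Rescale the three split ratios so that they sum to 1."""
    total = train_ratio + valid_ratio + test_ratio
    return train_ratio / total, valid_ratio / total, test_ratio / total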
def save_on_condition(condition, path_out, content):
    # persist `content` only when `condition` is truthy
    if condition:
        save_pkl_data(path_out, content)