Пример #1
0
def execute_with_conf(conf):
    """Run the extraction pipeline described by ``conf`` and pickle the result.

    Loads the program analysis results, extracts the drawable images and the
    contextual texts for every entry, and saves one
    ``[image, texts, permissions]`` record per entry to ``conf.path_save``.

    :param conf: configuration object. The attributes read here are
        ``path_pa``, ``path_app``, ``path_east``, ``path_save``,
        ``log_level``, ``layout_text_range``, ``enable_ocr_cache``,
        ``enable_translate``, ``enable_translate_cache``, ``ocr_width``,
        ``ocr_height`` and ``ocr_padding``.
    :raises AssertionError: if the three intermediate results differ in length.
    """
    # load program analysis results, format: app, image, layout, permissions
    print('loading program analysis results')
    data_pa = load_data(conf.path_pa)

    # extract drawable images
    print('extracting drawable images')
    # NOTE(review): check_conf presumably falls back to its last argument when
    # the configured value is not in the allowed set -- confirm in its definition.
    log_level = check_conf(conf.log_level, {0, 1, 2}, 0)
    # pass the validated log level (the raw conf.log_level was used before,
    # bypassing the check above)
    drawable_images = extract_drawable_images(data_pa, conf.path_app, log_level)

    # extract texts, format: layout_texts, embedded_texts, resource_texts
    print('extracting texts')
    search_range = check_conf(conf.layout_text_range, {'parent', 'total'}, 'parent')
    enable_ocr_cache = check_conf(conf.enable_ocr_cache, {True, False}, True)
    is_translate = check_conf(conf.enable_translate, {True, False}, True)
    enable_translate_cache = check_conf(conf.enable_translate_cache, {True, False}, True)
    texts = extract_contextual_texts(data_pa, drawable_images, conf.path_app, conf.path_east,
                                     search_range,
                                     (conf.ocr_width, conf.ocr_height), conf.ocr_padding, enable_ocr_cache,
                                     is_translate, enable_translate_cache,
                                     log_level)

    # merge and save the triple, <image, texts, permissions>
    print('finished and save')
    assert len(data_pa) == len(drawable_images) == len(texts)
    # format: [(compressed_img), [[layout_texts], [embedded_texts], [res_texts]], {permissions}]
    # the last field of every program analysis row holds the permissions
    data = [[image, text, row[-1]]
            for image, text, row in zip(drawable_images, texts, data_pa)]
    save_pkl_data(conf.path_save, data)
Пример #2
0
def predict(predict_conf):
    """Run a saved model over a pickled data set, then save and score the output.

    Reads ``path_data``, ``path_meta``, ``path_model``, ``path_predictions``
    and ``threshold`` from ``predict_conf``. The predictions are pickled
    together with the expected labels, and the metric scores are displayed.
    """
    # the data pickle holds two items; only the second one is used here
    _, samples = load_pkl_data(predict_conf.path_data)

    # meta data stored next to the model
    meta = load_pkl_data(predict_conf.path_meta)
    model_conf = meta['ModelConf']
    image_shape = model_conf.img_shape
    re_sample_type = model_conf.img_re_sample
    text_len = model_conf.text_length
    label2id = meta['label2id']
    label_num = len(label2id)
    id2label = {}
    for label, label_id in label2id.items():
        id2label[label_id] = label

    # restore the model; the custom layer has to be registered for loading
    custom_layers = {"CoAttentionParallel": CoAttentionParallel}
    model = keras.models.load_model(predict_conf.path_model,
                                    custom_objects=custom_layers)

    # both split ratios are 0 and only the third returned split is kept
    _, _, (x_test, y_test) = prepare_data(
        samples, image_shape, re_sample_type, text_len, label_num, 0, 0)

    # run the model
    y_predict = model.predict(x_test)
    y_true = y_test.tolist()

    # persist predictions together with the expected labels
    save_pkl_data(predict_conf.path_predictions, [y_predict, y_test])

    # report the metrics, with label names ordered by label id
    scores = evaluate(y_true, y_predict, predict_conf.threshold)
    label_names = [id2label[label_id] for label_id in range(len(id2label))]
    display_scores(scores, label_names)
Пример #3
0
def pre_process_save_results(path_out, data, v2id, l2id, wid_name_list):
    """Pickle the pre-processing results to ``path_out``.

    The saved payload is ``[(v2id, l2id), data, wid_name_list]``. When the
    module-level ``debug`` flag is truthy, ``v2id``, ``l2id`` and the sizes of
    ``data``, ``v2id`` and ``l2id`` are printed before saving.
    """
    # plain truthiness test instead of comparing against True (PEP 8)
    if debug:
        print("v2id")
        print(v2id)
        print("l2id")
        print(l2id)
        print(len(data), len(v2id), len(l2id))
    save_pkl_data(path_out, [(v2id, l2id), data, wid_name_list])
Пример #4
0
def pre_process_save_results(path_out, data, v2id, l2id):
    """Pickle ``[(v2id, l2id), data]`` to ``path_out``."""
    id_maps = (v2id, l2id)
    payload = [id_maps, data]
    save_pkl_data(path_out, payload)
Пример #5
0
def train(model_conf, train_conf):
    """Train the model ``train_conf.repeat_times`` times and report test scores.

    Every run creates a fresh model, fits it with early stopping and
    best-only checkpointing, and writes ``model_<code_name>_<i>`` artifacts
    (``.cp``, ``.h5``, ``.meta`` and, when enabled, ``.his`` and
    ``.predictions``) under ``train_conf.path_output``. When the normalized
    test ratio is positive, each run is evaluated on the test split and the
    averaged scores are displayed, and also written to
    ``result_<code_name>.avg.txt`` if ``train_conf.is_log_avg_score`` is set.

    :param model_conf: model configuration; ``img_shape``, ``img_re_sample``
        and ``text_length`` are read here, and the object is handed to
        ``create_model_with_conf`` and stored in the ``.meta`` file.
    :param train_conf: training configuration (paths, split ratios, random
        seed, fit parameters and logging flags).
    """
    # set up random seed
    random.seed(train_conf.random_seed)

    # make sure the output directory exists; exist_ok avoids the
    # check-then-create race of os.path.exists() followed by os.makedirs()
    os.makedirs(train_conf.path_output, exist_ok=True)

    # load and statistics
    (vocab2id, label2id), data = load_pkl_data(train_conf.path_data)
    id2vocab = {v: k for k, v in vocab2id.items()}
    id2label = {v: k for k, v in label2id.items()}
    token_size, label_size = len(id2vocab), len(id2label)
    label_names = [id2label[i] for i in range(label_size)]
    print('label size:', label_size, 'token size:', token_size)
    print('label names:', label_names)

    # split data
    train_ratio, valid_ratio, test_ratio = normalize_data_ratio(
        train_conf.train_ratio, train_conf.valid_ratio, train_conf.test_ratio)

    def split_data():
        # single place for the split so the initial split and the optional
        # per-run refresh cannot drift apart
        return prepare.prepare_data(
            data, model_conf.img_shape, model_conf.img_re_sample,
            model_conf.text_length, label_size, train_ratio, valid_ratio)

    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = split_data()
    print('train: {0}; valid: {1}; test: {2}'.format(len(y_train),
                                                     len(y_valid),
                                                     len(y_test)))

    # train and test
    scores = []
    predict_threshold = 0.5
    for i in range(train_conf.repeat_times):
        print('{sp}\ntime {i}\n{sp}'.format(sp='=' * 20, i=i))
        # prefix to save the training process
        path_prefix = os.path.join(
            train_conf.path_output,
            'model_{}_{}'.format(train_conf.code_name, i))

        # create and train the model
        model = create_model_with_conf(token_size, label_size, model_conf)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # init callbacks
        path_cp = path_prefix + '.cp'
        es = EarlyStopping(monitor=train_conf.monitor_type,
                           patience=train_conf.early_stop_patients)
        cp = ModelCheckpoint(filepath=path_cp,
                             monitor=train_conf.monitor_type,
                             save_best_only=True)

        # fit the model
        history = model.fit(x_train,
                            y_train,
                            batch_size=train_conf.batch_size,
                            epochs=train_conf.epochs,
                            verbose=train_conf.verbose,
                            validation_data=(x_valid, y_valid),
                            callbacks=[cp, es])

        # save training history
        save_on_condition(train_conf.is_log_history, path_prefix + '.his',
                          history.history)
        # save the trained model
        model.save(path_prefix + '.h5')
        # save the training meta data, e.g., TrainConf, vocab2id, label2id
        save_pkl_data(path_prefix + '.meta', {
            'ModelConf': model_conf,
            'vocab2id': vocab2id,
            'label2id': label2id
        })

        # test if test_ratio > 0
        if test_ratio > 0:
            # predict with the checkpointed weights rather than the weights
            # left in the model after the final epoch
            model.load_weights(path_cp)
            y_predict = model.predict(x_test)
            y_true = y_test.tolist()

            # save prediction
            if train_conf.is_log_prediction:
                path_predict = path_prefix + '.predictions'
                save_pkl_data(path_predict, [y_predict, y_test])

            # evaluate
            scores_current = metrics.evaluate(y_true, y_predict,
                                              predict_threshold)
            metrics.display_scores(scores_current, label_names)
            scores.append(scores_current)

        # prepare for the next loop
        # NOTE(review): on the final iteration this re-split is never used;
        # kept as-is to preserve the existing call sequence.
        if train_conf.is_data_refresh:
            (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = \
                split_data()

    if test_ratio > 0 and scores:
        # average score
        avg_scores = metrics.compute_mean_var(scores)
        metrics.display_average_scores(avg_scores, label_names,
                                       train_conf.repeat_times)

        # store average score
        if train_conf.is_log_avg_score:
            path_avg = os.path.join(
                train_conf.path_output,
                'result_{}.avg.txt'.format(train_conf.code_name))
            with codecs.open(path_avg, mode='w', encoding='UTF-8') as fo:
                metrics.display_average_scores(avg_scores,
                                               label_names,
                                               train_conf.repeat_times,
                                               is_k_print=True,
                                               fo=fo)
Пример #6
0
def save_on_condition(condition, path_out, content):
    """Pickle ``content`` to ``path_out`` only when ``condition`` is truthy."""
    if not condition:
        return
    save_pkl_data(path_out, content)