Example #1
            # Write the segmented line, tokens separated by spaces.
            fw.write(" ".join(line_seg_result) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)  # progress report every 100 lines

    fw.close()
    print(line_cnt)

    return True


if __name__ == '__main__':
    data_dir = get_data_dir()
    dict_path = os.path.join(data_dir, "msr.dict")
    test_path = os.path.join(data_dir, "msr_test.utf8")
    method = "bimm"  # bidirectional maximum matching
    max_num_char = 6  # maximum word length considered during matching

    test_result_path = os.path.join(data_dir, "msr_test_" + method + ".utf8")

    word_dict = load_dictionary(dict_path=dict_path)
    print("Total number of words is: %d\n" % (len(word_dict)))

    seg_on_file(word_dict=word_dict,
                test_path=test_path,
                seg_path=test_result_path,
                method=method,
                max_num_char=max_num_char)

    print(test_result_path)
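
Example #1 is truncated before the matching logic that seg_on_file dispatches on. For context, here is a minimal sketch of bidirectional maximum matching, the "bimm" method selected above; it assumes word_dict supports membership tests, and the helper names fmm_seg, rmm_seg and bimm_seg are hypothetical, not taken from this repository:

def fmm_seg(text, word_dict, max_num_char):
    # Forward maximum matching: greedily take the longest dictionary
    # word starting at the current position; fall back to one char.
    words, i = [], 0
    while i < len(text):
        for j in range(min(len(text), i + max_num_char), i, -1):
            if text[i:j] in word_dict or j == i + 1:
                words.append(text[i:j])
                i = j
                break
    return words


def rmm_seg(text, word_dict, max_num_char):
    # Reverse maximum matching: same idea, scanning right to left.
    words, j = [], len(text)
    while j > 0:
        for i in range(max(0, j - max_num_char), j):
            if text[i:j] in word_dict or i == j - 1:
                words.insert(0, text[i:j])
                j = i
                break
    return words


def bimm_seg(text, word_dict, max_num_char):
    # Bidirectional maximum matching: run both passes and keep the
    # result with fewer words, breaking ties toward fewer single chars.
    fmm = fmm_seg(text, word_dict, max_num_char)
    rmm = rmm_seg(text, word_dict, max_num_char)
    if len(fmm) != len(rmm):
        return fmm if len(fmm) < len(rmm) else rmm
    fmm_singles = sum(1 for w in fmm if len(w) == 1)
    rmm_singles = sum(1 for w in rmm if len(w) == 1)
    return fmm if fmm_singles <= rmm_singles else rmm

Preferring the segmentation with fewer words (and fewer single-character tokens on a tie) is the usual heuristic for combining the forward and reverse passes.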
Example #2
import os

import tensorflow as tf

# get_data_dir, get_model_dir, load_dictionary, load_separator_dict,
# BiRNNCRF, test_model_once and model_predict come from this project's
# own modules and are not shown in this example.


def segmentation():
    vocab_size = 3954
    embedding_dim = 64
    rnn_units = 32
    pad_index = 0  # padding index; masked out in the loss
    rnn_steps = 30  # training sequence length; not needed at test time

    data_dir = get_data_dir()
    test_path = os.path.join(data_dir, "msr_test.utf8")
    seg_path = os.path.join(data_dir, "msr_test_birnn_crf.utf8")
    char2id_dict_path = os.path.join(data_dir,
                                     "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Build and compile model.
    model = BiRNNCRF(vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)

    # The CRF layer supplies its own loss and accuracy metric.
    crf = model.crf_layer
    model.compile(optimizer=optimizer, loss=crf.loss, metrics=[crf.accuracy])
    # Alternative: a plain cross-entropy that masks out pad positions.
    # model.compile(optimizer=optimizer,
    #               loss=mask_sparse_cross_entropy,
    #               metrics=['acc'])

    # === Load weights.
    checkpoint_dir = os.path.join(get_model_dir(), "rnn_model")
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
    model.load_weights(checkpoint)

    # === Run the model once so its variables are built and the checkpoint
    # weights actually take effect.
    test_model_once(model=model, vocab_size=vocab_size)

    # === Load separator dictionary.
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    # Use context managers so both files are closed automatically.
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:

        line_cnt = 0
        for line in f:
            text = line.rstrip('\n')  # strip only the trailing newline

            labels = model_predict(model=model,
                                   char_list=text,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            if len(text) != len(labels):
                print("Wrong")
                print(text, '\n', labels)
                print(len(text), len(labels))
                break

            # Label scheme: {0: pad, 1: B, 2: M, 3: E, 4: S}.
            # A word ends at E (end of word) or S (single-char word).
            words = []
            word = []
            for char, label in zip(text, labels):
                word.append(char)
                if label == 3 or label == 4:
                    words.append("".join(word))
                    word = []
            if len(word) > 0:  # flush any trailing unfinished word
                words.append("".join(word))
            fw.write(" ".join(words) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

        print(line_cnt)
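
The commented-out compile path above references mask_sparse_cross_entropy, which this example never defines. A minimal sketch of what such a loss could look like, assuming labels use pad_index = 0 for padding and the model outputs raw logits (both assumptions, not confirmed by the source):

import tensorflow as tf


def mask_sparse_cross_entropy(y_true, y_pred, pad_index=0):
    # Sketch: sparse categorical cross-entropy that ignores pad positions.
    y_true = tf.cast(y_true, tf.int32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True)  # assumption: raw logits
    mask = tf.cast(tf.not_equal(y_true, pad_index), loss.dtype)
    # Average only over the non-pad positions.
    return tf.reduce_sum(loss * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)

Masking before averaging keeps the padded tail of each 30-step sequence from diluting the loss, which is why pad_index is singled out at the top of segmentation().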