fw.write(" ".join(line_seg_result) + '\n') line_cnt += 1 if line_cnt % 100 == 0: print(line_cnt) fw.close() print(line_cnt) return True if __name__ == '__main__': data_dir = get_data_dir() dict_path = os.path.join(data_dir, "msr.dict") test_path = os.path.join(data_dir, "msr_test.utf8") method = "bimm" max_num_char = 6 test_result_path = os.path.join(data_dir, "msr_test_" + method + ".utf8") word_dict = load_dictionary(dict_path=dict_path) print("Total number of words is: %d\n" % (len(word_dict))) seg_on_file(word_dict=word_dict, test_path=test_path, seg_path=test_result_path, method=method, max_num_char=max_num_char) print(test_result_path)
def _labels_to_words(chars, labels):
    """Group characters into words according to per-character tag ids.

    Tag ids follow the scheme {0: pad, 1: B, 2: M, 3: E, 4: S}: a word is
    closed after every character tagged E (3) or S (4). Characters left over
    after the last E/S tag are emitted as one final word.
    """
    word_end_labels = (3, 4)
    words = []
    current = []
    for char, label in zip(chars, labels):
        current.append(char)
        if label in word_end_labels:
            words.append("".join(current))
            current = []
    if current:
        words.append("".join(current))
    return words


def segmentation():
    """Segment the MSR test file with the BiRNN-CRF model.

    Builds the model, restores the latest checkpoint found under
    ``<model_dir>/rnn_model``, then reads ``msr_test.utf8`` from the data
    directory line by line. Each line is tagged by ``model_predict`` and the
    resulting words are written, space-separated, one line per input line,
    to ``msr_test_birnn_crf.utf8`` in the same directory.

    The running line count is printed every 100 lines and once at the end.
    If the number of predicted labels differs from the number of characters
    in a line, diagnostics are printed and processing stops at that line.

    Raises:
        FileNotFoundError: if no checkpoint exists in the checkpoint
            directory.
    """
    vocab_size = 3954
    embedding_dim = 64
    rnn_units = 32

    data_dir = get_data_dir()
    test_path = os.path.join(data_dir, "msr_test.utf8")
    seg_path = os.path.join(data_dir, "msr_test_birnn_crf.utf8")

    char2id_dict_path = os.path.join(data_dir, "msr_training_char2id_dict.pkl")
    char2id_dict = load_dictionary(dict_path=char2id_dict_path)
    print("#char2id_dict=%d" % len(char2id_dict))

    # === Build and compile model.
    model = BiRNNCRF(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)
    optimizer = tf.keras.optimizers.Adam(0.001)
    crf = model.crf_layer
    model.compile(optimizer=optimizer, loss=crf.loss, metrics=[crf.accuracy])

    # === Load weights.
    checkpoint_dir = os.path.join(get_model_dir(), "rnn_model")
    checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
    if checkpoint is None:
        # latest_checkpoint returns None when nothing is found; fail with a
        # clear message instead of an obscure error inside load_weights.
        raise FileNotFoundError("No checkpoint found in %s" % checkpoint_dir)
    model.load_weights(checkpoint)

    # === Run once, to load weights of checkpoint.
    test_model_once(model=model, vocab_size=vocab_size)

    # Load separator_dict
    separator_dict = load_separator_dict()
    print("#separator_dict=%d" % len(separator_dict))

    line_cnt = 0
    # Context managers guarantee both files are closed even if prediction
    # raises part-way through the input.
    with open(test_path, 'r', encoding='utf-8') as f, \
            open(seg_path, 'w', encoding='utf-8') as fw:
        for line in f:
            # Strip only the line terminator: the final line of a file may
            # lack one, and slicing with [:-1] would drop a real character.
            text = line.rstrip('\n')

            labels = model_predict(model=model,
                                   char_list=text,
                                   char2id_dict=char2id_dict,
                                   separator_dict=separator_dict)

            if len(text) != len(labels):
                print("Wrong")
                print(text, '\n', labels)
                print(len(text), len(labels))
                break

            fw.write(" ".join(_labels_to_words(text, labels)) + '\n')

            line_cnt += 1
            if line_cnt % 100 == 0:
                print(line_cnt)

    print(line_cnt)