start_char, end_char = ent_arrays[0] label_ent_array.append((start_char, end_char + 1, l)) ents.append(label_ent_array[0]) if True == diff_contain_overlapping(ents): i = i + 1 doc = nlp(text) tags = biluo_tags_from_offsets(doc, ents) doc.ents = spans_from_biluo_tags(doc, tags) line = docs_to_json([doc]) f.write(json_dumps(line) + "\n") msg.good(f"Finished {file_path} :: {i} rows") if print_label: msg.info(f"{labels}") if __name__ == "__main__": # Chinese.Defaults.use_jieba = True nlp = Chinese() nlp.add_pipe(nlp.create_pipe('sentencizer')) dev_data = read_jsonl(Path("./cluener2020/dev.json")) train_data = read_jsonl(Path("./cluener2020/train.json")) format_data_to_jsonl(dev_data, Path("./clue_spacy_dev.jsonl")) format_data_to_jsonl(train_data, Path("./clue_spacy_train.jsonl"), print_label=True)
with open("exercises/zh/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Chinese()
matcher = PhraseMatcher(nlp.vocab)
# NOTE(review): COUNTRIES is not defined in this chunk -- presumably loaded
# from a countries.json alongside capitals.json; confirm in the full file.
# nlp.pipe() already yields Doc objects, so unpack the generator directly
# instead of materializing an intermediate list.
matcher.add("COUNTRY", None, *nlp.pipe(COUNTRIES))


def countries_component(doc):
    """Tag every phrase-matcher hit as a "GPE" entity on *doc* and return it.

    Overwrites doc.ents entirely with the matcher's results, so any entities
    set by earlier pipeline components are discarded.
    """
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline (spaCy v2-style function component).
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter: look the span's text up in the country -> capital dictionary;
# dict.get returns None for countries missing from CAPITALS.
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with that getter.
Span.set_extension("capital", getter=get_capital, force=True)

# Process a text and print entity text, label and the capital attribute.
doc = nlp("新加坡可能会和马来西亚一起建造高铁。")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])