def get_article_entities(idx, text, ltp):
    """
    Extract named entities from a single article.
    :param idx: index of the article
    :param text: article text
    :param ltp: LTP instance used for word segmentation and NER
    :return: list of (word, label) tuples
    """
    entities = []
    try:
        sents = split_sentence(text)
        seg, hidden = ltp.seg(sents)
        nertags = ltp.ner(hidden)
        for idx_sent, tags_of_sent in enumerate(nertags):
            for label, start, end in tags_of_sent:
                word = "".join(seg[idx_sent][start:end + 1])
                # Skip single-character entities.
                if len(word) < 2:
                    continue
                entities.append((word, label))
    except Exception as e:
        print("ERROR get_article_entities: {}".format(e))
    return entities
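# Minimal usage sketch for get_article_entities. It assumes the LTP 4.x API
# (LTP().seg / LTP().ner) that the function above already relies on, and that
# split_sentence is imported at module level from ltp.utils; the sample text
# and the printed result are illustrative only.
from ltp import LTP

ltp = LTP()  # loads the default pretrained model
sample = "美国总统奥巴马访问中国。"
print(get_article_entities(0, sample, ltp))
# e.g. [('美国', 'Ns'), ('奥巴马', 'Nh'), ('中国', 'Ns')]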
def sent_split(self, inputs: List[str], flag: str = "all", limit: int = 510):
    """Split each input text into sentences and flatten the results into a single list."""
    inputs = [split_sentence(text, flag=flag, limit=limit) for text in inputs]
    inputs = list(itertools.chain(*inputs))
    return inputs
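# Illustrative sketch of the flattening behaviour of sent_split above, written
# as a standalone function (no class context). flatten_split is a hypothetical
# name; it assumes ltp.utils.split_sentence with its default arguments.
import itertools
from typing import List

from ltp.utils import split_sentence

def flatten_split(inputs: List[str]) -> List[str]:
    pieces = [split_sentence(text) for text in inputs]
    return list(itertools.chain(*pieces))

print(flatten_split(["第一句。第二句。", "第三句!"]))
# e.g. ['第一句。', '第二句。', '第三句!']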
def annotate_text(text):
    """Run the full annotation pipeline (segmentation, POS, NER, dependency parsing) on a text."""
    origin_sentences = split_sentence(text)
    debug_logger.debug("{}".format("\n".join(origin_sentences)))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tags
    sentences = nlp.dependency(words, hidden)
    for sent in sentences:
        dp_graph(sent)
    return sentences
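# Usage sketch for annotate_text. The sent.words attribute and the per-word
# to_string() call mirror the test code further below; the input sentence is
# illustrative only.
annotated = annotate_text("美国总统奥巴马访问中国。")
for sent in annotated:
    for word in sent.words:
        print(word.to_string())  # lemma, POS, NER and dependency info per word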
def test_extract_rule(self):
    from ltp.utils import split_sentence
    from EventExploreServer.component.nlp_annotator.nlp import nlp
    debug_logger.setLevel(logging.DEBUG)
    text = '''
    哈德森出生在伦敦的郊区汉普斯特德。
    美国总统奥巴马访问中国。
    美国总统特朗普对中国进行国事访问。
    1927年3月11日,蒋介石、张静江、张群、黄山等人踏雪游览庐山风光。
    小鸟在水边寻食。
    '''
    origin_sentences = split_sentence(text)
    lemmas, hidden = nlp.segment(origin_sentences)
    print(lemmas)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)
    for sent in sentences:
        for word in sent.words:
            print(word.to_string())
        print(sent.get_name_entities())

    print("Test Extract Rule:")
    # save = True
    save = False
    from EventExploreServer.model import TripleUnit
    extract_rule(TripleUnit([sentences[0].words[0]], [sentences[0].words[1]], [sentences[0].words[6]]),
                 sentences[0], save)
    extract_rule(TripleUnit([sentences[1].words[0]], [sentences[1].words[1]], [sentences[1].words[2]]),
                 sentences[1], save)
    extract_rule(TripleUnit([sentences[1].words[2]], [sentences[1].words[3]], [sentences[1].words[4]]),
                 sentences[1], save)

    root = find_sub_structure_root([sentences[1].words[0], sentences[1].words[1], sentences[1].words[2]],
                                   sentences[1])
    print('Find root word: {}'.format(root))

    root = get_dp_root(sentences[3])
    p1 = dfs_root_to_target_path(root, sentences[3].words[4], sentences[3])
    for w in p1:
        print(w.lemma, end='-->')
    print()

    print("Test1: e1-蒋介石 rel-游览")
    get_shortest_path(sentences[3].words[4], sentences[3].words[15], sentences[3])
    print("Test2: e1-1927年 rel-踏雪")
    get_shortest_path(sentences[3].words[0], sentences[3].words[14], sentences[3])
    print("Test3: e1-踏雪 rel-1927年")
    get_shortest_path(sentences[3].words[14], sentences[3].words[0], sentences[3])
    print("Test4: e1-游览 rel-庐山")
    get_shortest_path(sentences[3].words[15], sentences[3].words[16], sentences[3])
    print("Test5: e1-庐山 rel-游览")
    get_shortest_path(sentences[3].words[16], sentences[3].words[15], sentences[3])
def extract_text(text, generalization=True):
    """
    Run relation extraction on the input text.
    :param text: input text
    :return: 2-D list of Triple elements, one inner list per sentence
    """
    origin_sentences = split_sentence(text)
    debug_logger.debug("{}".format("\n".join(origin_sentences)))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tags
    sentences = nlp.dependency(words, hidden)
    triples = []
    for idx_sent, sent in enumerate(origin_sentences):
        triples_of_sent = extractor.extract(sent, sentences[idx_sent], idx_sent)
        triples.append(triples_of_sent)
    return triples
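# Sketch of consuming the result of extract_text. Only the list-of-lists
# structure documented above is assumed; printing a triple relies on the
# Triple class providing a readable __str__/__repr__, which is not shown here.
triples = extract_text("美国总统奥巴马访问中国。小明死于美国。")
for idx_sent, triples_of_sent in enumerate(triples):
    print("sentence {}: {} triple(s)".format(idx_sent, len(triples_of_sent)))
    for triple in triples_of_sent:
        print(triple)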
def test_extract_by_rules(self):
    debug_logger.setLevel(logging.DEBUG)
    from ltp.utils import split_sentence
    from EventExploreServer.component import NLP
    from EventExploreServer.component.open_relation_extraction.extractor import Extractor1
    text = '''
    小明死于美国。
    美国总统奥巴马访问中国。
    '''
    origin_sentences = split_sentence(text)
    nlp = NLP()
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)
    for i in range(len(origin_sentences)):
        extractor = Extractor1(origin_sentences[i], sentences[i])
        for entity_pair in extractor.entity_pairs:
            extract_by_rules(entity_pair.get_pair(), sentences[i])
def test_split_sentence(self):
    text1 = '''
    习近平主席访问奥巴马总统。
    习近平主席访问奥巴马总统先生。习近平主席同志访问奥巴马总统先生。
    习近平主席视察厦门,李克强访问香港。
    习近平来我家了,我跑着去迎接。
    习近平视察并访问厦门。
    拉里佩奇和谢尔盖布林创建Google。
    马云创建了阿里巴巴、蚂蚁金服和淘宝。
    习近平主席同志和奥巴马总统先生一同访问非洲。
    张三访问非洲和慰问小明。
    张三访问和慰问小明。
    张三视察和访问巴西。
    张三访问并慰问小明。
    张三视察并访问小明。
    张三视察并访问巴西。
    张三视察并访问Google。
    张三,李四和小明都去了美国。
    小明去了美国,英国和法国。
    '''
    from ltp.utils import split_sentence
    for sent in split_sentence(text1):
        print(sent)