def annotate_sentence(origin_sentence):
    """Run the full LTP pipeline (segment -> POS -> NER -> dependency) on a single sentence."""
    lemmas, hidden = nlp.segment([origin_sentence])
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tags
    sentence = nlp.dependency(words, hidden)
    dp_graph(sentence[0])
    return sentence[0]

def annotate_text(text):
    """Run the full LTP pipeline on a text, returning one annotated Sentence per split sentence."""
    origin_sentences = split_sentence(text)
    debug_logger.debug("\n".join(origin_sentences))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tags
    sentences = nlp.dependency(words, hidden)
    for sent in sentences:
        dp_graph(sent)
    return sentences

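# A minimal usage sketch for the two annotators above (the input strings are
# hypothetical; assumes the module-level nlp wrapper is already initialized).
# Sentence objects expose .words, .get_name_entities(), and Word.to_string(),
# as used in the tests below:
#
#     sent = annotate_sentence("美国总统奥巴马访问中国。")
#     for word in sent.words:
#         print(word.to_string())
#
#     for sent in annotate_text("哈德森出生在伦敦。小鸟在水边寻食。"):
#         print(sent.get_name_entities())
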
def test_extract_rule(self):
    from ltp.utils import split_sentence
    from EventExploreServer.component.nlp_annotator.nlp import nlp

    debug_logger.setLevel(logging.DEBUG)
    text = '''
    哈德森出生在伦敦的郊区汉普斯特德。
    美国总统奥巴马访问中国。
    美国总统特朗普对中国进行国事访问。
    1927年3月11日,蒋介石、张静江、张群、黄山等人踏雪游览庐山风光。
    小鸟在水边寻食。
    '''
    origin_sentences = split_sentence(text)
    lemmas, hidden = nlp.segment(origin_sentences)
    print(lemmas)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)
    for sent in sentences:
        for word in sent.words:
            print(word.to_string())
        print(sent.get_name_entities())

    print("Test Extract Rule:")
    # save = True
    save = False
    from EventExploreServer.model import TripleUnit
    extract_rule(TripleUnit([sentences[0].words[0]], [sentences[0].words[1]], [sentences[0].words[6]]), sentences[0], save)
    extract_rule(TripleUnit([sentences[1].words[0]], [sentences[1].words[1]], [sentences[1].words[2]]), sentences[1], save)
    extract_rule(TripleUnit([sentences[1].words[2]], [sentences[1].words[3]], [sentences[1].words[4]]), sentences[1], save)

    root = find_sub_structure_root([sentences[1].words[0], sentences[1].words[1], sentences[1].words[2]], sentences[1])
    print('Find root word: {}'.format(root))

    root = get_dp_root(sentences[3])
    p1 = dfs_root_to_target_path(root, sentences[3].words[4], sentences[3])
    for w in p1:
        print(w.lemma, end='-->')
    print()

    print("Test1: e1-蒋介石 rel-游览")
    get_shortest_path(sentences[3].words[4], sentences[3].words[15], sentences[3])
    print("Test2: e1-1927年 rel-踏雪")
    get_shortest_path(sentences[3].words[0], sentences[3].words[14], sentences[3])
    print("Test3: e1-踏雪 rel-1927年")
    get_shortest_path(sentences[3].words[14], sentences[3].words[0], sentences[3])
    print("Test4: e1-游览 rel-庐山")
    get_shortest_path(sentences[3].words[15], sentences[3].words[16], sentences[3])
    print("Test5: e1-庐山 rel-游览")
    get_shortest_path(sentences[3].words[16], sentences[3].words[15], sentences[3])

def extract_text(text, generalization=True):
    """
    Run relation extraction over the input text.
    :param text: raw input text
    :param generalization: generalize the extracted triples (currently unused here; see extract_article)
    :return: 2-D list of Triple elements, one inner list per sentence
    """
    origin_sentences = split_sentence(text)
    debug_logger.debug("\n".join(origin_sentences))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tags
    sentences = nlp.dependency(words, hidden)
    triples = []
    for idx_sent, sent in enumerate(origin_sentences):
        triples_of_sent = extractor.extract(sent, sentences[idx_sent], idx_sent)
        triples.append(triples_of_sent)
    return triples

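# A minimal usage sketch for extract_text (the input string is hypothetical;
# assumes the module-level extractor is initialized). Triple.to_string() is
# the same accessor used for logging in extract_article below:
#
#     triples = extract_text("美国总统奥巴马访问中国。")
#     for triples_of_sent in triples:
#         for t in triples_of_sent:
#             print(t.to_string())
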
def extract_article(article, idx_document=0, generalization=False):
    """
    Extract relation triples from the title and content of an ArticleES.
    :param article: ArticleES object
    :param idx_document: index of the source document
    :param generalization: generalize the extracted triples
    :return: 2-D list of Triple elements, one inner list per sentence
    """
    origin_sentences = article.sentence_of_title + article.sentence_of_content
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)

    triples = []
    for idx_sent, sent in enumerate(origin_sentences):
        trace_logger.info("Sentence {}: {}".format(idx_sent + 1, sent))
        # debug_logger.debug(sent)
        # for word in sentences[idx_sent].words:
        #     debug_logger.debug(word.to_string())
        triples_of_sent = extractor.extract(sent, sentences[idx_sent], idx_sent, idx_document)
        triples.append(triples_of_sent)

    for ts_of_sent in triples:
        for t in ts_of_sent:
            # debug_logger.debug(t.to_string())
            trace_logger.info(t.to_string())

    if not generalization:
        return triples

    trace_logger.info('Generalizing triples...')
    generalization_triples = []
    for triples_of_sent in triples:
        tmp = []
        for triple in triples_of_sent:
            tmp.extend(triple.gts)
        generalization_triples.append(tmp)
    return generalization_triples

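# A minimal usage sketch for extract_article (the article variable is a
# hypothetical ArticleES instance; assumes the generalized triples in
# Triple.gts expose to_string() like Triple itself does):
#
#     triples = extract_article(article, idx_document=0, generalization=True)
#     for triples_of_sent in triples:
#         for gt in triples_of_sent:
#             trace_logger.info(gt.to_string())
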
class NLPLTP:
    def __init__(self, default_model_dir=LTP4_MODEL_DIR):
        print(default_model_dir)
        self.ltp = LTP(path=default_model_dir)


if __name__ == '__main__':
    ltp = LTP(path=LTP4_MODEL_DIR)
    seg, hidden = ltp.seg(["他叫汤姆去拿外衣。", "他就读于复旦大学。", "吴秀波diss李明", "新京报记者吴婷婷"])
    pos = ltp.pos(hidden)
    ner = ltp.ner(hidden)
    dep = ltp.dep(hidden)
    srl = ltp.srl(hidden)
    sdp = ltp.sdp(hidden)
    print('seg: ', seg)
    print('hidden: ', hidden)
    print('')
    print('pos: ', pos)
    print('ner: ', ner)
    print('dep: ', dep)
    print('srl: ', srl)
    print('sdp: ', sdp)

    origin_sentences = ["他叫汤姆去拿外衣。", "他就读于复旦大学。"]
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)