Example No. 1
def annotate_sentence(origin_sentence):
    """Annotate a single sentence: segmentation, POS tagging, NER and dependency parsing."""
    lemmas, hidden = nlp.segment([origin_sentence])  # word segmentation
    words = nlp.postag(lemmas, hidden)               # part-of-speech tagging
    words = nlp.nertag(words, hidden)                # add NER tags
    sentence = nlp.dependency(words, hidden)         # dependency parsing
    dp_graph(sentence[0])                            # build the dependency graph (project helper)
    return sentence[0]
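
A minimal usage sketch for annotate_sentence above; the import path is an assumption (only EventExploreServer.component.nlp_annotator.nlp appears in these examples), while sent.words, word.to_string() and sent.get_name_entities() are the accessors used in Example No. 3:

# Hypothetical import path; annotate_sentence is the helper defined above.
from EventExploreServer.component.nlp_annotator.annotation import annotate_sentence

sent = annotate_sentence("美国总统奥巴马访问中国。")
for word in sent.words:              # per-word annotations, printed as in Example No. 3
    print(word.to_string())
print(sent.get_name_entities())      # named entities recognized in the sentence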
Example No. 2
def annotate_text(text):
    """Split the text into sentences, then run segmentation, POS tagging, NER and dependency parsing on all of them."""
    origin_sentences = split_sentence(text)
    debug_logger.debug("\n".join(origin_sentences))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tag
    sentences = nlp.dependency(words, hidden)
    for sent in sentences:
        dp_graph(sent)

    return sentences
Example No. 3
    def test_extract_rule(self):
        from ltp.utils import split_sentence
        from EventExploreServer.component.nlp_annotator.nlp import nlp
        debug_logger.setLevel(logging.DEBUG)
        text = '''
        哈德森出生在伦敦的郊区汉普斯特德。 
        美国总统奥巴马访问中国。
        美国总统特朗普对中国进行国事访问。 
        1927年3月11日,蒋介石、张静江、张群、黄山等人踏雪游览庐山风光。
        小鸟在水边寻食。
        '''
        origin_sentences = split_sentence(text)
        lemmas, hidden = nlp.segment(origin_sentences)
        print(lemmas)
        words_postag = nlp.postag(lemmas, hidden)
        words_nertag = nlp.nertag(words_postag, hidden)
        sentences = nlp.dependency(words_nertag, hidden)
        for sent in sentences:
            for word in sent.words:
                print(word.to_string())
            print(sent.get_name_entities())
        print("Test Extract Rule:")
        # save = True
        save = False
        from EventExploreServer.model import TripleUnit
        extract_rule(TripleUnit([sentences[0].words[0]], [sentences[0].words[1]], [sentences[0].words[6]]),
                     sentences[0], save)
        extract_rule(TripleUnit([sentences[1].words[0]], [sentences[1].words[1]], [sentences[1].words[2]]),
                     sentences[1], save)
        extract_rule(TripleUnit([sentences[1].words[2]], [sentences[1].words[3]], [sentences[1].words[4]]),
                     sentences[1], save)

        root = find_sub_structure_root([sentences[1].words[0], sentences[1].words[1], sentences[1].words[2]],
                                       sentences[1])
        print('Find root word: {}'.format(root))

        root = get_dp_root(sentences[3])
        p1 = dfs_root_to_target_path(root, sentences[3].words[4], sentences[3])
        for w in p1:
            print(w.lemma, end='-->')
        print()
        print("Test1: e1-蒋介石  rel-游览")
        get_shortest_path(sentences[3].words[4], sentences[3].words[15], sentences[3])
        print("Test2: e1-1927年  rel-踏雪")
        get_shortest_path(sentences[3].words[0], sentences[3].words[14], sentences[3])
        print("Test3: e1-踏雪  rel-1927年")
        get_shortest_path(sentences[3].words[14], sentences[3].words[0], sentences[3])
        print("Test4: e1-游览  rel-庐山")
        get_shortest_path(sentences[3].words[15], sentences[3].words[16], sentences[3])
        print("Test5: e1-庐山  rel-游览")
        get_shortest_path(sentences[3].words[16], sentences[3].words[15], sentences[3])
Example No. 4
def extract_text(text, generalization=True):
    """
    对输入对文本进行关系抽取
    :param text:
    :return: Triple元素的二维list
    """
    origin_sentences = split_sentence(text)
    debug_logger.debug("\n".join(origin_sentences))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tag
    sentences = nlp.dependency(words, hidden)

    triples = []
    for idx_sent, sent in enumerate(origin_sentences):
        triples_of_sent = extractor.extract(sent, sentences[idx_sent], idx_sent)
        triples.append(triples_of_sent)
    return triples
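
A hedged usage sketch for extract_text; the import path is hypothetical, and the nested loop mirrors how Example No. 5 walks the returned 2D list:

from EventExploreServer.component.extractor import extract_text  # hypothetical module path

text = "美国总统奥巴马访问中国。美国总统特朗普对中国进行国事访问。"
triples = extract_text(text)  # one inner list of Triple objects per sentence
for idx_sent, triples_of_sent in enumerate(triples):
    for t in triples_of_sent:
        print("Sentence {}: {}".format(idx_sent + 1, t.to_string()))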
Example No. 5
def extract_article(article, idx_document=0, generalization=False):
    """
    对一个ArticleES中的title和content进行关系元组抽取
    :param article: ArticleES对象
    :param generalization: 对抽取的元组泛化处理
    :return: Triple元素的二维list
    """

    origin_sentences = article.sentence_of_title + article.sentence_of_content
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)

    triples = []
    for idx_sent, sent in enumerate(origin_sentences):
        trace_logger.info("Sentence {}: {}".format(idx_sent+1, sent))
        # debug_logger.debug(sent)
        # for word in sentences[idx_sent].words:
        #     debug_logger.debug(word.to_string())
        triples_of_sent = extractor.extract(sent, sentences[idx_sent], idx_sent, idx_document)
        triples.append(triples_of_sent)
    for ts_of_sent in triples:
        for t in ts_of_sent:
            # debug_logger.debug(t.to_string())
            trace_logger.info(t.to_string())

    if not generalization:
        return triples

    trace_logger.info('Generalizing triples...')
    generalization_triples = []
    for triples_of_sent in triples:
        tmp = []
        for triple in triples_of_sent:
            tmp.extend(triple.gts)
        generalization_triples.append(tmp)
    return generalization_triples
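
A usage sketch for extract_article, assuming an ArticleES instance has already been loaded elsewhere (its construction is not shown in these examples); with the default generalization=False the inner lists contain Triple objects:

# `article` is assumed to be an ArticleES object fetched elsewhere in the project.
triples = extract_article(article, idx_document=0)  # generalization defaults to False
for triples_of_sent in triples:
    for t in triples_of_sent:
        print(t.to_string())
# Passing generalization=True instead returns the generalized triples (triple.gts) per sentence.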
Example No. 6
class NLPLTP:
    """Wrapper that loads the LTP model from the given model directory."""
    def __init__(self, default_model_dir=LTP4_MODEL_DIR):
        print(default_model_dir)
        self.ltp = LTP(path=default_model_dir)


if __name__ == '__main__':
    ltp = LTP(path=LTP4_MODEL_DIR)
    seg, hidden = ltp.seg(["他叫汤姆去拿外衣。", "他就读于复旦大学。", "吴秀波diss李明", "新京报记者吴婷婷"])
    pos = ltp.pos(hidden)
    ner = ltp.ner(hidden)
    dep = ltp.dep(hidden)
    srl = ltp.srl(hidden)
    sdp = ltp.sdp(hidden)

    print('seg: ', seg)
    print('hidden', hidden)
    print('')
    print('pos: ', pos)
    print('ner: ', ner)
    print('dep: ', dep)
    print('srl: ', srl)
    print('sdp: ', sdp)

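    # Same pipeline again, but through the project's nlp wrapper (compare Examples No. 1 and No. 2).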
    origin_sentences = ["他叫汤姆去拿外衣。", "他就读于复旦大学。"]
    lemmas, hidden = nlp.segment(origin_sentences)
    words_postag = nlp.postag(lemmas, hidden)
    words_nertag = nlp.nertag(words_postag, hidden)
    sentences = nlp.dependency(words_nertag, hidden)