Example #1
def get_article_entities(idx, text, ltp):
    """
    对每篇文章提取命名实体
    :param article:
    :param ltp:
    :return:
    """
    entities = []
    try:
        sents = split_sentence(text)
        seg, hidden = ltp.seg(sents)
        nertags = ltp.ner(hidden)
        for idx1, tags_of_sent in enumerate(nertags):
            for tag in tags_of_sent:
                label, start, end = tag
                word = "".join(seg[idx1][start:end + 1])
                if len(word) < 2:  # skip single-character entities
                    continue
                entities.append((word, label))
    except Exception as e:
        print("ERROR get_article_entites: {}".format(e))
        # debug_logger.debug("ERROR get_article_entites: {}".format(e))
    return entities
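
A minimal call-site sketch for the function above, assuming the LTP 4.x API (LTP().seg() returning (words, hidden) and LTP().ner(hidden) yielding (label, start, end) triples per sentence) and that split_sentence is already imported as in this example:

# Usage sketch; the sample text and printed entities are illustrative.
from ltp import LTP

ltp = LTP()  # loads the default model
entities = get_article_entities(0, "美国总统奥巴马访问中国。", ltp)
print(entities)  # e.g. [('美国', 'Ns'), ('奥巴马', 'Nh'), ('中国', 'Ns')]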
Example #2
File: ltp.py  Project: bill007bill/ltp
    def sent_split(self,
                   inputs: List[str],
                   flag: str = "all",
                   limit: int = 510):
        # Split every input document into sentences, then flatten
        # the per-document lists into a single list.
        inputs = [
            split_sentence(text, flag=flag, limit=limit) for text in inputs
        ]
        inputs = list(itertools.chain(*inputs))
        return inputs
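
A quick call sketch for sent_split: flag and limit are forwarded to split_sentence unchanged, and the result is one flat sentence list across all inputs (the sample texts are hypothetical):

# Assumes an LTP instance exposing sent_split(), as in the class above.
from ltp import LTP

ltp = LTP()
sents = ltp.sent_split(["第一句话。第二句话!", "第三句话?"])
print(sents)  # ['第一句话。', '第二句话!', '第三句话?']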
Example #3
def annotate_text(text):
    origin_sentences = split_sentence(text)
    debug_logger.debug("{}".format("\n".join(origin_sentences)))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tag
    sentences = nlp.dependency(words, hidden)
    for sent in sentences:
        dp_graph(sent)

    return sentences
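
The pipeline order above (segment, then postag, then nertag, then dependency, with each stage reusing the shared hidden state) recurs in Examples #4-#6. A condensed sketch using the same project imports that Example #4 uses; the nlp wrapper API is taken from these examples, not from LTP itself:

from ltp.utils import split_sentence
from EventExploreServer.component.nlp_annotator.nlp import nlp

sents = split_sentence("美国总统奥巴马访问中国。小明死于美国。")
lemmas, hidden = nlp.segment(sents)     # word segmentation
words = nlp.postag(lemmas, hidden)      # part-of-speech tags
words = nlp.nertag(words, hidden)       # named-entity tags
parsed = nlp.dependency(words, hidden)  # dependency parse per sentence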
Example #4
    def test_extract_rule(self):
        from ltp.utils import split_sentence
        from EventExploreServer.component.nlp_annotator.nlp import nlp
        debug_logger.setLevel(logging.DEBUG)
        text = '''
        哈德森出生在伦敦的郊区汉普斯特德。 
        美国总统奥巴马访问中国。
        美国总统特朗普对中国进行国事访问。 
        1927年3月11日,蒋介石、张静江、张群、黄山等人踏雪游览庐山风光。
        小鸟在水边寻食。
        '''
        origin_sentences = split_sentence(text)
        lemmas, hidden = nlp.segment(origin_sentences)
        print(lemmas)
        words_postag = nlp.postag(lemmas, hidden)
        words_nertag = nlp.nertag(words_postag, hidden)
        sentences = nlp.dependency(words_nertag, hidden)
        for sent in sentences:
            for word in sent.words:
                print(word.to_string())
            print(sent.get_name_entities())
        print("Test Extract Rule:")
        # save = True
        save = False
        from EventExploreServer.model import TripleUnit
        extract_rule(TripleUnit([sentences[0].words[0]], [sentences[0].words[1]], [sentences[0].words[6]]),
                     sentences[0], save)
        extract_rule(TripleUnit([sentences[1].words[0]], [sentences[1].words[1]], [sentences[1].words[2]]),
                     sentences[1], save)
        extract_rule(TripleUnit([sentences[1].words[2]], [sentences[1].words[3]], [sentences[1].words[4]]),
                     sentences[1], save)

        root = find_sub_structure_root([sentences[1].words[0], sentences[1].words[1], sentences[1].words[2]],
                                       sentences[1])
        print('Find root word: {}'.format(root))

        root = get_dp_root(sentences[3])
        p1 = dfs_root_to_target_path(root, sentences[3].words[4], sentences[3])
        for w in p1:
            print(w.lemma, end='-->')
        print()
        print("Test1: e1-蒋介石  rel-游览")
        get_shortest_path(sentences[3].words[4], sentences[3].words[15], sentences[3])
        print("Test2: e1-1927年  rel-踏雪")
        get_shortest_path(sentences[3].words[0], sentences[3].words[14], sentences[3])
        print("Test3: e1-踏雪  rel-1927年")
        get_shortest_path(sentences[3].words[14], sentences[3].words[0], sentences[3])
        print("Test4: e1-游览  rel-庐山")
        get_shortest_path(sentences[3].words[15], sentences[3].words[16], sentences[3])
        print("Test5: e1-庐山  rel-游览")
        get_shortest_path(sentences[3].words[16], sentences[3].words[15], sentences[3])
Example #5
def extract_text(text, generalization=True):
    """
    对输入对文本进行关系抽取
    :param text:
    :return: Triple元素的二维list
    """
    origin_sentences = split_sentence(text)
    debug_logger.debug("{}".format("\n".join(origin_sentences)))
    lemmas, hidden = nlp.segment(origin_sentences)
    words = nlp.postag(lemmas, hidden)
    words = nlp.nertag(words, hidden)  # add NER tag
    sentences = nlp.dependency(words, hidden)

    triples = []
    for idx_sent, sent in enumerate(origin_sentences):
        triples_of_sent = extractor.extract(sent, sentences[idx_sent], idx_sent)
        triples.append(triples_of_sent)
    return triples
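
A short call sketch for extract_text; per the loop above, the return value holds one list of triples per input sentence (the sample text is hypothetical):

# Usage sketch; Triple objects come from the project's extractor.
triples = extract_text("美国总统奥巴马访问中国。")
for idx_sent, triples_of_sent in enumerate(triples):
    print(idx_sent, triples_of_sent)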
Example #6
    def test_extract_by_rules(self):
        debug_logger.setLevel(logging.DEBUG)
        from ltp.utils import split_sentence
        from EventExploreServer.component import NLP
        from EventExploreServer.component.open_relation_extraction.extractor import Extractor1
        text = '''
                小明死于美国。 
                美国总统奥巴马访问中国。
                '''
        origin_sentences = split_sentence(text)
        nlp = NLP()
        lemmas, hidden = nlp.segment(origin_sentences)
        words_postag = nlp.postag(lemmas, hidden)
        words_nertag = nlp.nertag(words_postag, hidden)
        sentences = nlp.dependency(words_nertag, hidden)

        for i in range(len(origin_sentences)):
            extractor = Extractor1(origin_sentences[i], sentences[i])
            for entity_pair in extractor.entity_pairs:
                extract_by_rules(entity_pair.get_pair(), sentences[i])
Example #7
    def test_split_sentence(self):
        text1 = '''
            习近平主席访问奥巴马总统。
            习近平主席访问奥巴马总统先生。习近平主席同志访问奥巴马总统先生。
            习近平主席视察厦门,李克强访问香港。
            习近平来我家了,我跑着去迎接。
            习近平视察并访问厦门。
            拉里佩奇和谢尔盖布林创建Google。
            马云创建了阿里巴巴、蚂蚁金服和淘宝。
            习近平主席同志和奥巴马总统先生一同访问非洲。
            张三访问非洲和慰问小明。
            张三访问和慰问小明。
            张三视察和访问巴西。
            张三访问并慰问小明。
            张三视察并访问小明。
            张三视察并访问巴西。
            张三视察并访问Google。
            张三,李四和小明都去了美国。
            小明去了美国,英国和法国。
            '''
        from ltp.utils import split_sentence
        for sent in split_sentence(text1):
            print(sent)
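
For reference, a standalone run of the same utility, assuming split_sentence splits on sentence-final punctuation (。!?) and trims surrounding whitespace:

from ltp.utils import split_sentence

text = "习近平主席视察厦门,李克强访问香港。习近平来我家了,我跑着去迎接。"
for sent in split_sentence(text):
    print(sent)
# Expected (illustrative):
# 习近平主席视察厦门,李克强访问香港。
# 习近平来我家了,我跑着去迎接。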