Пример #1
0
def ConfirmMuseum(text, museum, textid):
    """Return 1 if *museum* appears in an org_name or location entity of
    *text*, else 0.

    Only the first 1000 characters of *text* are sent to the NER service.
    *textid* is used purely for error reporting.
    Returns 0 (instead of the previous implicit None) when the API
    response is missing an expected key.
    """
    # NOTE: hard-coded API token; earlier tokens kept commented out by the author.
    # nlp = BosonNLP('SeJUopMY.24669.6kCKU4ruI3ss')
    # nlp = BosonNLP('lMdMTyuV.24544.0VHv6klp6Pk6')
    nlp = BosonNLP('sjWBhf9i.24699.rQmsCad9c3Jv')
    try:
        text = text[0:1000]  # cap payload size sent to the API
        result = nlp.ner(text)[0]
        words = result['word']
        entities = result['entity']
        for entity in entities:
            # entity = [start_token, end_token, entity_type]; the org_name and
            # location branches did identical work, so they are merged here.
            if entity[2] in ('org_name', 'location'):
                span = ''.join(words[entity[0]:entity[1]])
                if museum in span:
                    print('Confirm!')
                    return 1
        print('Not!')
        return 0
    except KeyError as e:
        print('exit in %s' % textid)
        print(e)
        # BUG FIX: previously fell through and returned None implicitly;
        # return 0 so callers always receive an int.
        return 0
Пример #2
0
def Entity_extraction(text):
    """Print every named entity found in *text* together with its type.

    Sends *text* to the BosonNLP NER endpoint and prints, for each
    entity, the space-joined token span followed by the entity type.
    Returns None (output is printed only).
    """
    nlp = BosonNLP("x-gOGutn.27554.G6_6QvdJafES")
    rest = nlp.ner(text)[0]  # single-document request -> first result
    print(rest)
    words = rest['word']
    entities = rest['entity']
    # NOTE: removed unused local `tags` (rest['tag'] was never read).
    for entity in entities:
        # entity = [start_token, end_token, entity_type]
        print(" ".join(words[entity[0]:entity[1]]), entity[2])
Пример #3
0
class QueryParser(object):
    """Thin wrapper around the BosonNLP client for parsing travel-style
    queries (e.g. ticket searches) into words, entities and POS tags."""

    def __init__(self):
        self.nlp = BosonNLP(bosonnlp_token)

    def parse(self, query_string):
        """Run NER on *query_string* and return (words, entities, tags).

        input:
        7月22号 北京到上海的高铁票
        output:
        [{'entity': [[0, 3, 'time'], [3, 4, 'location'], [5, 6, 'location']],
        'tag': ['t', 'm', 'q', 'ns', 'p', 'ns', 'ude', 'n', 'n'],
         'word': ['7月', '22', '号', '北京', '到', '上海', '的', '高铁', '票']}]
        """
        parsed = self.nlp.ner(query_string)[0]
        return (parsed['word'], parsed['entity'], parsed['tag'])

    def get_entity(self, parsed_words, index_tuple):
        """Return the token span of one recognized entity.

        index_tuple holds (begin, end) token indices (half-open range);
        parsed_words is the token list produced by parse().
        """
        begin = index_tuple[0]
        end = index_tuple[1]
        return parsed_words[begin:end]

    def format_entities(self, entities):
        """Wrap each [begin, end, name] entity triple in a namedtuple."""
        namedentity = collections.namedtuple(
            'namedentity', 'index_begin index_end entity_name')
        return [namedentity(e[0], e[1], e[2]) for e in entities]

    def get_format_time(self, time_entity):
        """Convert a time expression to its 'YYYY-MM-DD' date part.

        output
        {'timestamp': '2013-02-28 16:30:29', 'type': 'timestamp'}
        """
        result = self.nlp.convert_time(time_entity, datetime.datetime.today())
        return result["timestamp"].split(" ")[0]
Пример #4
0
def bosonnlpNER(news):
    """Run BosonNLP NER over *news* and collect organisation/person entities.

    Returns a 3-tuple:
      N            -- list of [entity_text, entity_type] pairs
      entity_start -- dict mapping each kept entity's start token index
                      to its end token index
      words        -- the full token list from the NER result
    """
    from bosonnlp import BosonNLP
    nlp = BosonNLP('cKWUytiR.34676.f5F2YbS_EyX2')
    ner = nlp.ner(news)[0]
    print(ner)
    words = ner['word']
    wanted = {'org_name', 'person_name'}
    N = []
    entity_start = {}  # start token index -> end token index
    for start, end, kind in ner['entity']:
        if kind in wanted:
            entity_start[start] = end
            N.append([''.join(words[start:end]), kind])
    return N, entity_start, words
Пример #5
0
class QueryParser(object):
    """Wrapper over the BosonNLP client: NER parsing plus helpers for
    slicing, naming and time-normalizing recognized entities."""

    def __init__(self):
        self.nlp = BosonNLP(bosonnlp_token)

    def parse(self, query_string):
        """NER-parse *query_string*; return (words, entities, tags).

        input:
        7月22号 北京到上海的高铁票
        output:
        [{'entity': [[0, 3, 'time'], [3, 4, 'location'], [5, 6, 'location']],
        'tag': ['t', 'm', 'q', 'ns', 'p', 'ns', 'ude', 'n', 'n'],
         'word': ['7月', '22', '号', '北京', '到', '上海', '的', '高铁', '票']}]
        """
        first = self.nlp.ner(query_string)[0]
        words, entities, tags = first['word'], first['entity'], first['tag']
        return (words, entities, tags)

    def get_entity(self, parsed_words, index_tuple):
        """Return parsed_words[begin:end] for the (begin, end) pair in
        *index_tuple* — the tokens making up one recognized entity."""
        return parsed_words[index_tuple[0]:index_tuple[1]]

    def format_entities(self, entities):
        """Give each [begin, end, name] triple named fields."""
        namedentity = collections.namedtuple(
            'namedentity', 'index_begin index_end entity_name')
        return [namedentity(item[0], item[1], item[2]) for item in entities]

    def get_format_time(self, time_entity):
        """Normalize a time expression and return its date ('YYYY-MM-DD').

        output
        {'timestamp': '2013-02-28 16:30:29', 'type': 'timestamp'}
        """
        today = datetime.datetime.today()
        converted = self.nlp.convert_time(time_entity, today)
        stamp = converted["timestamp"]
        return stamp.split(" ")[0]
Пример #6
0
def getAnswerEntities(text_set, api_key, level):
    """Extract entities of the type(s) selected by *level* from *text_set*.

    level: 0 -> location; 1 -> person_name; 2 -> product_name;
           3 -> org_name/company_name; 4 -> product/org/company names.
    Returns sortList() applied to the matched entity strings.
    Raises KeyError for an unknown level.
    """
    def f(x):
        # BUG FIX: every level now maps to a tuple.  Previously levels
        # 0-2 returned a bare string, so `entity[2] in f(level)` was a
        # *substring* test (e.g. 'cat' in 'location' is True) and could
        # match unrelated entity types.
        return {
            '0': ('location',),
            '1': ('person_name',),
            '2': ('product_name',),
            '3': ('org_name', 'company_name'),
            '4': ('product_name', 'org_name', 'company_name'),
        }[str(x)]

    nlp = BosonNLP(api_key)
    result = nlp.ner(text_set)[0]
    words = result['word']
    entities = result['entity']
    wanted = f(level)  # hoisted: loop-invariant lookup
    ul = []
    for entity in entities:
        if entity[2] in wanted:
            ul.append(''.join(words[entity[0]:entity[1]]))
    keys = sortList(ul)
    return keys
Пример #7
0
def getAnswerEntities(text_set, api_key, level):
    """Extract entities of the type(s) selected by *level* from *text_set*.

    level: 0 -> location; 1 -> person_name; 2 -> product_name;
           3 -> org_name/company_name; 4 -> product/org/company names.
    Returns sortList() applied to the matched entity strings.
    Raises KeyError for an unknown level.
    """
    def f(x):
        # BUG FIX: every level now maps to a tuple.  Previously levels
        # 0-2 returned a bare string, so `entity[2] in f(level)` was a
        # *substring* test (e.g. 'cat' in 'location' is True) and could
        # match unrelated entity types.
        return {
            '0': ('location',),
            '1': ('person_name',),
            '2': ('product_name',),
            '3': ('org_name', 'company_name'),
            '4': ('product_name', 'org_name', 'company_name'),
        }[str(x)]

    nlp = BosonNLP(api_key)
    result = nlp.ner(text_set)[0]
    words = result['word']
    entities = result['entity']
    wanted = f(level)  # hoisted: loop-invariant lookup
    ul = []
    for entity in entities:
        if entity[2] in wanted:
            ul.append(''.join(words[entity[0]:entity[1]]))
    keys = sortList(ul)
    return keys
Пример #8
0
class NERProcess(multiprocessing.Process):
    """Worker process that runs BosonNLP NER over a batch of phrases.

    Each phrase's raw NER JSON is dumped to ./boson_result/<document_id>.
    NOTE: Python 2 code (print statements, str encode semantics).
    """

    def __init__(self, nername, phrase_list, groupid=0):
        multiprocessing.Process.__init__(self)
        self.nername = nername
        self.phrase_list = phrase_list
        self.numofphrase = len(phrase_list)

        # batch ID, and will be used for file name
        self.group_id = str(groupid)

        # load NER modules
        self.boson_ner = BosonNLP("bJ0hvqpK.21947.dpf19nyJfNHp")
        #self.conn = self.boson_ner.connect()
        #self.ltp_ner = LTPNer()

        self.jsonData = {}

        print "creating subprocess : " + self.nername + ":" + self.group_id + ", number of phrase: " + str(
            self.numofphrase)

    def run(self):
        """Send every phrase in the batch to the NER API and write one JSON
        file per document under ./boson_result/."""
        print "subprocess " + self.nername + ":" + self.group_id + " started @ " + time.ctime(
        )

        jsonList = []

        # NOTE(review): `iter` shadows the builtin and `jsonList`/`phrase_id`
        # are assigned but never used — kept unchanged here.
        for iter in range(self.numofphrase):
            raw_text = self.phrase_list[iter].text
            raw_text = raw_text.encode("utf-8", "error")

            document_id = str(self.phrase_list[iter].document_id)
            phrase_id = str(self.phrase_list[iter].id)

            boson_json = self.boson_ner.ner(raw_text)

            # one output file per document_id (overwritten on repeated ids)
            with open("./boson_result/" + document_id, "w") as f:
                json.dump(boson_json, f, ensure_ascii=False)

        print "subprocess " + self.nername + ":" + self.group_id + " ended @ " + time.ctime(
        )
Пример #9
0
def boson_nre_testCase():
    """Run BosonNLP NER over '1.txt' in TXT_PATH and print the
    de-duplicated (entity_text, entity_type) pairs found in the document.

    NOTE: Python 2 code (print statements); relies on module-level
    API_TOKEN and TXT_PATH.
    """
    nlp=BosonNLP(API_TOKEN)
    os.chdir(TXT_PATH)
    fd=codecs.open("1.txt",'r',encoding='utf-8',errors="ignore",buffering=1)
    docText=fd.read()
    result=nlp.ner(docText)[0]
    words=result['word']
#     tags=result['tag']
    entities=result['entity']
#     print len(words)," ",len(tags)," ",len(entities)
#     print words
#     print "\n"
#     print tags
#     print "\n"
#     print len(entities)
#     print entities
#     print "\n"
    
    docEntityList=[]
    # entities[i] = [start_token, end_token, entity_type]
    for i in range(len(entities)):
        mergedToken=u""
        # concatenate the token span into one entity string
        for j in range(entities[i][0],entities[i][1],1):
            mergedToken=mergedToken+words[j]
        token_candidate=(mergedToken,str(entities[i][2]))
        
        #check for redundant token, only add new token into the entity_list
        flag=0
        for k in range(len(docEntityList)):
            if (token_candidate[0]==docEntityList[k][0]) and \
            (token_candidate[1]==docEntityList[k][1]):
                flag=flag+1        
        if flag==0:
            docEntityList.append(token_candidate)
            
#     return docEntityList
    print "=============="        
    print len(docEntityList)
    for i in range(len(docEntityList)):
        print docEntityList[i][0],"    ","type:",docEntityList[i][1]
Пример #10
0
def boson_nre_batch():
    """Run BosonNLP NER over every file in TXT_PATH and append the
    de-duplicated (entity, type, filename) triples to
    E:\\wordDir\\EntityList.txt as "entity||type||filename" lines.

    NOTE: Python 2 code (print statements, str.decode('gb2312') on
    filenames); relies on module-level API_TOKEN and TXT_PATH.
    """
    os.chdir("E:\\wordDir")
    fd_out=codecs.open("EntityList.txt",'a',encoding="utf-8",errors='ignore',buffering=1)
    nlp=BosonNLP(API_TOKEN)
    entityCollection=[]
    os.chdir(TXT_PATH)
    for files in os.listdir(os.getcwd()):
        # NOTE(review): input file handles are never closed explicitly
        fd=codecs.open(files,"r",encoding="utf-8")
        docText=fd.read()
        result=nlp.ner(docText)[0]
        words=result['word']
    #     tags=result['tag']
        entities=result['entity']
        docEntityList=[]
        # entities[i] = [start_token, end_token, entity_type]
        for i in range(len(entities)):
            mergedToken=u""
            # concatenate the token span into one entity string
            for j in range(entities[i][0],entities[i][1],1):
                mergedToken=mergedToken+words[j]
            token_candidate=(mergedToken,str(entities[i][2]),files.decode('gb2312'))
            
            #check for redundant token, only add new token into the entity_list
            flag=0
            for k in range(len(docEntityList)):
                if (token_candidate[0]==docEntityList[k][0]) and \
                (token_candidate[1]==docEntityList[k][1]):
                    flag=flag+1        
            if flag==0:
                docEntityList.append(token_candidate)
        print files.decode('gb2312')," ",len(docEntityList)
        entityCollection.append(docEntityList)
                

    # flush every collected triple as "entity||type||filename"
    for i in range(len(entityCollection)):
        for j in range(len(entityCollection[i])):
            outputStr=entityCollection[i][j][0]+"||"+entityCollection[i][j][1]+"||"+entityCollection[i][j][2]+"\n"
            fd_out.write(outputStr)
    fd_out.close()
Пример #11
0
# person_name -> PER
# org_name, company_name, product_name -> ORG

from bosonnlp import BosonNLP
import os

api = "avzp5h2G.21940.kBiq3cew8Oct"
#nlp = BosonNLP(os.environ[api])
nlp = BosonNLP(api)

Input_file = './data/original.txt'
Output_file = './data/output_news.txt'
# Read the whole input file (Input_file is rebound from path to content).
# Original note said this "returns a list of sentences", but the code
# reads the file as one string.
Input_file = open(Input_file, 'r', encoding='utf-8').read()

# NOTE(review): elsewhere in this file nlp.ner(text)[0] yields a result
# dict; unpacking three names directly here depends on the client's
# return shape for this call — confirm against the bosonnlp version used.
entity, tag, word = nlp.ner(Input_file)
LOC_type = ['location']
PER_type = ['person_name']
ORG_type = ['org_name', 'company_name', 'product_name']

with open(Output_file, 'w', encoding='utf-8') as f:
    lastindex = 0
    # write the non-entity tokens found *between* entities, labelled 'O'
    for s, t, entity_type in entity:
        entity_name = []
        if s > lastindex:
            entity_name = word[lastindex:s]
            entity_name = ''.join(entity_name)
            f.write(entity_name + ' ' + 'O')
            f.write('\n')
        lastindex = t
Пример #12
0
def Entity_extraction(sentence):  # text entity analysis
    """Extract locations, times, person names and job titles from
    *sentence* via BosonNLP NER, print them, and draw a category graph
    of the entities with networkx/matplotlib.

    Returns None; output is printed and shown with plot.show().
    """
    nlp = BosonNLP('TPDuivpZ.27572.rVuPCI9-kUlN')
    result = nlp.ner(sentence)
    words = result[0]['word']
    # BUG FIX: join the full token span words[start:end] for each entity.
    # The original used only words[start], truncating multi-token entities
    # (every other extractor in this file joins the span).
    pairs = [[''.join(words[ent[0]:ent[1]]), ent[2]]
             for ent in result[0]['entity']]
    location_list = []  # place names
    time_list = []      # time expressions
    person_name = []    # person names
    job_list = []       # job titles
    for text, kind in pairs:
        if kind == 'location':
            location_list.append(text)
        if kind == 'time':
            time_list.append(text)
        if kind == 'person_name':
            person_name.append(text)
        if kind == 'job_title':
            job_list.append(text)
    # de-duplicate (set ordering is unspecified, as in the original)
    location_list = list(set(location_list))
    time_list = list(set(time_list))
    person_name = list(set(person_name))
    job_list = list(set(job_list))
    location = {'地名': location_list}
    time = {'时间': time_list}
    person = {'人名': person_name}
    job = {'工作': job_list}
    print('地名:{}\n\n时间:{}\n\n人名:{}\n\n工作:{}'.format(location['地名'], time['时间'],
                                                    person['人名'], job['工作']))
    # draw the entity-analysis graph
    plot.rcParams['font.sans-serif'] = ['SimHei']
    plot.rcParams['axes.unicode_minus'] = False
    DG = nx.DiGraph()
    plot.figure(figsize=(8, 8))
    plot.subplot(1, 1, 1)
    DG.add_nodes_from(['文本', '地名', '时间', '人名', '工作'])
    DG.add_edges_from([('文本', '地名'), ('文本', '人названия')] if False else
                      [('文本', '地名'), ('文本', '人名'), ('文本', '工作'), ('文本', '时间')])
    # attach each extracted entity under its category node; node/edge
    # insertion order and the color sequence match the original exactly
    categories = [('地名', location['地名'], 'lightblue'),
                  ('时间', time['时间'], 'plum'),
                  ('人名', person['人名'], 'lightgreen'),
                  ('工作', job['工作'], 'darkgray')]
    for label, names, _ in categories:
        DG.add_nodes_from(names)
        for name in names:
            DG.add_edge(label, name)
    colors = ['red', 'deepskyblue', 'magenta', 'limegreen', 'dimgrey']
    for _, names, color in categories:
        colors.extend(color for _ in names)
    nx.draw(DG, with_labels=True, node_size=700, node_color=colors)
    plot.title('文本实体分析', color='red', fontsize=20)
    plot.show()
Пример #13
0
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals

from bosonnlp import BosonNLP
# Smoke test for the Boson named-entity-recognition API.
# NOTE: replace with your own API token when testing.
nlp = BosonNLP('Reg0KvHM.17970.YFwdM3sID8xa')
test = open("地标.txt")
try:
    correct = 0
    list_of_test = []
    # sample only lines 100-103 to limit API calls
    for line in test.readlines()[100:104]:
        line = line.strip('\n')
        list_of_test.append(line)
    # NOTE(review): the inner loop reuses the name `entity` from the outer
    # loop — harmless here, but easy to misread.
    for entity in list_of_test:
        result = nlp.ner(entity)[0]
        words = result['word']
        entities = result['entity']
        # count location/org/company entities as correct; print the rest
        for entity in entities:
            if entity[2] == 'location' or entity[2] == 'org_name' or entity[
                    2] == 'company_name':
                correct = correct + 1
            else:
                print(''.join(words[entity[0]:entity[1]]), entity[2])
    # NOTE(review): `correct` counts entities while the denominator counts
    # test lines, so this ratio can exceed 1.0 — confirm intended metric.
    precision = correct / len(list_of_test)
    print(correct, precision)
finally:
    test.close()
Пример #14
0
mytoken = 'wHXup4Wh.13586.tkz9YxHxkO_o'
nlp = BosonNLP(mytoken)

# Text initialisation: pick row i of the rlaq_u2 DataFrame
# (defined elsewhere; presumably a case-record table — columns include
# id, ajbh, time, jyaq (narrative text), place).
# NOTE: `id` shadows the builtin.
i = 77
id = rlaq_u2.loc[i, 'id']
ajbh = rlaq_u2.loc[i, 'ajbh']
fssj = pd.to_datetime(rlaq_u2.loc[i, 'time'])
txt = rlaq_u2.loc[i, 'jyaq']
txt0 = txt
place = rlaq_u2.loc[i, 'place']
print(txt0)

# Extract times, places and people from the narrative via NER.
result = nlp.ner(txt)[0]
words = result['word']
entities = result['entity']
Btime = []    # time expressions
Bplace = []   # locations
Bpeople = []  # person names
for entity in entities:
    # entity = [start_token, end_token, entity_type]
    if entity[2] == 'time':
        Btime.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'location':
        Bplace.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'person_name':
        Bpeople.append(''.join(words[entity[0]:entity[1]]))
print('时间:', ' | '.join(Btime))
print('地点:', ' | '.join(Bplace))
print('人物:', ' | '.join(Bpeople))
Пример #15
0
# -*- encoding: utf-8 -*-
from bosonnlp import BosonNLP
import os

#reference from http://bosonnlp-py.readthedocs.io/#bosonnlp-py

# Usage examples for the bosonnlp client (placeholder token below —
# substitute a real API token before running).
nlp = BosonNLP('bosonnlp的API')
# or nlp = BosonNLP(os.environ['BOSON_API_TOKEN'])
nlp.ner('你好啊', sensitivity=2)
nlp.ner(['成都商报记者 姚永忠', '微软XP操作系统今日正式退休'])
result = nlp.tag('成都商报记者 姚永忠')
# NOTE(review): format_tag_result is not defined in this file —
# presumably imported elsewhere; confirm before running.
format_tag_result(result[0])
result = nlp.tag(['亚投行意向创始成员国确定为57个', '“流量贵”频被吐槽'], oov_level=0)
result = nlp.tag("成都商报记者 姚永忠", space_mode=2)
Пример #16
0
    cnt = 0
    flag = 0
    # NOTE(review): fragment of a larger scope — `Input_file`, `index`,
    # `text`, `entities`, `tags`, `words`, `nlp` and `Output_file` are
    # defined outside this excerpt.
    # Accumulate lines into batches of 78 and send each batch to NER.
    for line in Input_file:
        cnt += 1
        print(len(Input_file), cnt)
        if index < 78:
            text.append(line)
            index += 1
            if (cnt == len(Input_file)):
                flag = 1  # last line reached: flush a partial final batch
            #print (index, line)
        if (index >= 78) or (flag == 1):
            index = 0
            text_str = '\n'.join(text)
            print(text_str)
            ner_dict = nlp.ner(text_str)
            print(ner_dict)
            entity = ner_dict[0]['entity']
            word = ner_dict[0]['word']
            tag = ner_dict[0]['tag']
            print(entity, word)
            entities.append(entity)
            tags.append(tag)
            words.append(word)
            text = []  # reset buffer for the next batch
#text.append(line)

    # dump the collected (entity, word) batches; `lastindex` is set but
    # the loop body visible here does not use it further
    with open(Output_file, 'w', encoding='utf-8') as f:
        for entity, word in zip(entities, words):
            print(entity, word)
            lastindex = 0
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals

from bosonnlp import BosonNLP

# NOTE: replace with your own API token when testing.
nlp = BosonNLP('VaUKhf7X.7870.xbHiGWB_gx49')
# NOTE(review): the two adjacent string literals below are implicitly
# concatenated into a SINGLE list element — possibly a missing comma;
# confirm whether two separate documents were intended.
s = ['中新网周口9月15日电(刘鹏) 15日,针对媒体报道的河南省太康县女子在当地一家KTV遭3名协警暴力殴打一事,太康县警方向记者回复称,3名打人者中两名为协警身份,其中一名协警未参与打架,但目前两名协警均被辞退。而当晚一同前往KTV娱乐的一名正式女民警被关禁闭。  据之前媒体报道,今年9月4日晚11时左右,太康县一家KTV内,一名姜姓女士在送走一位朋友后正返回KTV时,在门口碰到正从里面出来的三名男子。其中一名男子对姜女士动手动脚,另一男子则说姜女士为“小姐”。  受到羞辱的姜女士要求对方赔礼道歉。没想到竟遭到了三名男子拳脚相加。据姜女士反映,事发当晚黑衣男子对她一番推搡致其头部撞到门上;绿衣男子则直接拽着她的头发将其摁倒在地,随后又遭到了拳头打脸、脚踹并拉着衣服将其头往门上撞。姜女士试图报警,结果三名男子将其手机夺走摔到地上。为了阻止围观群众报警,白衣男子直接拿出“警官证”,称自己是刑警队人员,若是报警,不把录像删了,就把KTV店给砸了。  15日上午,太康县公安局发布对此事件的调查处理通报。通报称,9月4日晚,葛某(太康县人,无业)、师某(协警)等人到盛世年华夜总会唱歌,当晚23时结束后,师某、葛某与姜某发生争执吵骂,并引起厮打,致使姜某轻微伤。目前双方已达成调解协议,姜某对师某、葛某达成谅解。  太康县公安局负责处理此事的王姓警官透露,事发当晚,和打人者葛某、师某一同前往KTV娱乐的还有该局一名刚入职不久的女民警李某某及协警司某等人,但他们并未参与打架。后经太康县公安局党委研究决定,对违规进入娱乐场所的民警李某某先行禁闭,待调查结束后再做处理;对违规进入娱乐场所的协警师某、司某予以辞退。'
     '纪检部门仍在调查之中。成都商报记者 姚永']
result = nlp.ner(s)[0]
words = result['word']
entities = result['entity']


# print each entity span with its type
for entity in entities:
    print(''.join(words[entity[0]:entity[1]]), entity[2])

print(s)
result = nlp.sentiment(s)
print(result)
Пример #18
0
class BosonNlpp:
    """Convenience facade over the BosonNLP HTTP API client."""

    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # sentiment analysis
    def testSentiment(self, s):
        """Return sentiment scores for text *s*."""
        return self.bonlp.sentiment(s)

    # named entity recognition
    def lexicalAnalysis(self, s):
        """Return the NER result of the first (only) document."""
        return self.bonlp.ner(s)[0]

    # dependency parsing
    def textDependency(self, s):
        """Return the dependency parse of *s*."""
        return self.bonlp.depparser(s)

    # keyword extraction
    def testKeywords(self, s):
        """Return the top-10 weighted keywords of *s*."""
        return self.bonlp.extract_keywords(s, top_k=10)

    # news classification
    def textClassify(self, s):
        """Classify *s* and return the Chinese category label."""
        labels = {
            0: '体育',
            1: '教育',
            2: '财经',
            3: '社会',
            4: '娱乐',
            5: '军事',
            6: '国内',
            7: '科技',
            8: '互联网',
            9: '房产',
            10: '国际',
            11: '女人',
            12: '汽车',
            13: '游戏',
        }
        category_index = self.bonlp.classify(s)[0]
        return (labels[category_index])

    # semantic suggestion
    def lexicalSynonym(self, term):
        """Return the top-10 semantically related terms."""
        return self.bonlp.suggest(term, top_k=10)

    # segmentation and POS tagging
    def fenci(self, s):
        """Return the segmentation/POS result for *s*."""
        return self.bonlp.tag(s)

    def newssubstract(self, s):
        """Summarize *s* (expects UTF-8 bytes; decoded before the call)."""
        #s=s.encode('utf8')
        decoded = s.decode('utf-8')
        return self.bonlp.summary('', decoded)
Пример #19
0
def bosonNer(text, sensitivity):
    """Run BosonNLP NER on *text* at the given sensitivity (1-5)."""
    client = BosonNLP('O8M_j1Nd.4200.wIlhsL46w9-C')
    result = client.ner(text, sensitivity)
    return result
Пример #20
0
def bosonNer(text, sensitivity):
    """Run BosonNLP NER on *text* at the given sensitivity (1-5)."""
    client = BosonNLP('qJWJc-f3.4334.MamzfHZ-9wUL')
    result = client.ner(text, sensitivity)
    return result
### 4. Named entity recognition demo

from bosonnlp import BosonNLP

words_list = list()  # NOTE(review): unused here; kept in case later code reads it

nlp = BosonNLP('g8lQg9Mv.25818.fAbbwt6TYhh8')  # API token
result = nlp.tag('承德市长江大桥')

print(result)
print(result[0]['word'])
print(result[0]['tag'])

# print word/tag pairs; enumerate replaces the range(len(...)) loop
for i, w in enumerate(result[0]['word']):
    print(w + '/' + result[0]['tag'][i], end=' ')
print()

print(' '.join([a + '/' + b for a, b in zip(result[0]['word'], result[0]['tag'])]))

# sensitivity (int, default 3) – precision/recall trade-off: 1 finds more
# entities, 5 finds entities with higher precision.
sentence = '美国国防部发言人威廉斯说,伊拉克持续将五艘共约载有万桶原油的超级油轮,与距科威特海岸五公里处的海岛石油转运站的原油倾入北波斯湾。'
result = nlp.ner(sentence, sensitivity=2)

print(result[0]['word'])
print(result[0]['tag'])
print(result[0]['entity'])


for s, e, entity in result[0]['entity']:
    # BUG FIX: removed a stray trailing '"' that made the original line a
    # syntax error (scraping artifact).  NOTE(review): '\\t' prints a
    # literal backslash-t, not a tab — kept byte-identical to the original.
    print('%-14s\\t%s' % (''.join(result[0]['word'][s:e]), entity))