from aip import AipNlp


def NLP_SDK(text, method='depParser', APP_ID=_APP_ID, API_KEY=_API_KEY, SECRET_KEY=_SECRET_KEY, **options):
    '''
    Parameter names follow the official Baidu SDK documentation without modification:
    <https://ai.baidu.com/docs#/NLP-Python-SDK/top>
    :param text: the text to process; some methods expect a pair passed as a list
    :param method: name of the SDK feature to call
    :param APP_ID: project credential
    :param API_KEY: project credential
    :param SECRET_KEY: project credential
    :param options: other optional parameters
    :return: the raw result returned by the Baidu SDK

    Supported method names:
    lexical analysis 'lexer', lexical analysis (custom) 'lexerCustom', dependency parsing 'depParser',
    word embedding 'wordEmbedding', DNN language model 'dnnlm', word similarity 'wordSimEmbedding',
    short-text similarity 'simnet', comment opinion extraction 'commentTag',
    sentiment classification 'sentimentClassify', article tagging 'keyword', article classification 'topic'
    '''
    client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    # Lexical analysis
    if method == 'lexer':
        result = client.lexer(text, **options)
    # Lexical analysis (custom)
    elif method == 'lexerCustom':
        result = client.lexerCustom(text, **options)
    # Dependency parsing
    elif method == 'depParser':
        result = client.depParser(text, **options)
    # Word embedding
    elif method == 'wordEmbedding':
        result = client.wordEmbedding(text, **options)
    # DNN language model
    elif method == 'dnnlm':
        result = client.dnnlm(text, **options)
    # Word similarity
    elif method == 'wordSimEmbedding':
        word1, word2 = text[0], text[1]
        result = client.wordSimEmbedding(word1, word2, **options)
    # Short-text similarity
    elif method == 'simnet':
        text1, text2 = text[0], text[1]
        result = client.simnet(text1, text2, **options)
    # Comment opinion extraction
    elif method == 'commentTag':
        result = client.commentTag(text, **options)
    # Sentiment classification
    elif method == 'sentimentClassify':
        result = client.sentimentClassify(text, **options)
    # Article tagging
    elif method == 'keyword':
        title, content = text[0], text[1]
        result = client.keyword(title, content, **options)
    # Article classification
    elif method == 'topic':
        title, content = text[0], text[1]
        result = client.topic(title, content, **options)
    return result
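A minimal sketch of how the dispatcher above might be called, assuming the module-level `_APP_ID`/`_API_KEY`/`_SECRET_KEY` defaults are defined elsewhere: single-text methods take a string, while the paired methods take a two-element list, as the docstring notes.

# Hypothetical calls against NLP_SDK (the example texts are arbitrary)
print(NLP_SDK('百度是一家高科技公司', method='lexer'))
print(NLP_SDK('百度是一家高科技公司', method='depParser'))
print(NLP_SDK(['北京', '上海'], method='wordSimEmbedding'))
print(NLP_SDK(['穿衣裳', '穿衣服'], method='simnet'))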
class BaiduNLP:
    def __init__(self):
        self.APP_ID = '9519234'
        self.API_KEY = 'CIwEvSR9m9hEWnQp2GK7LGKI'
        self.SECRET_KEY = 's4hA4YTO1SjqIkRzTCT5uHSa715BKHFL'
        self.baiduNlp = AipNlp(self.APP_ID, self.API_KEY, self.SECRET_KEY)

    '''---------- Word segmentation -----------'''

    def wordseg(self, words):
        return self.baiduNlp.wordseg(words)

    '''---------- Part-of-speech tagging -----------'''

    def wordpos(self, words):
        return self.baiduNlp.wordpos(words)

    '''---------- Word embedding -----------'''

    def wordembedding(self, words1, words2=''):
        return self.baiduNlp.wordembedding(words1, words2)

    '''---------- Comment opinion extraction; category defaults to 7 (education) -----------'''

    def commenttag(self, words, type=7):
        comment = self.baiduNlp.commentTag(words, {'type': type})
        # Post-process the response
        commentTags = comment[u'tags']  # the extracted opinions; there may be several
        validComment = []
        for temp in commentTags:
            abstract = self.__deleteInvalid(temp[u'abstract'])
            tempComment = {
                u'abstract': abstract,
                u'adj': temp[u'adj'],
                u'fea': temp[u'fea'],
                u'type': temp[u'type']
            }
            validComment.append(tempComment)
        return validComment

    '''---------- DNN language model -----------'''

    def dnnlm(self, words):
        return self.baiduNlp.dnnlm(words)

    '''---------- Short-text similarity -----------'''

    def simnet(self, essay1, essay2):
        return self.baiduNlp.simnet(essay1, essay2)

    def __deleteInvalid(self, sentence):
        # Strip the <span> and </span> markers from the abstract
        abstract = sentence.replace("<span>", "")
        abstract = abstract.replace("</span>", "")
        return abstract
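A hypothetical way to exercise the wrapper above; the keys printed (`fea`, `adj`, `abstract`) are the ones the commenttag() post-processing builds, and the example texts are arbitrary.

nlp = BaiduNLP()
# commenttag() returns the cleaned list of {'abstract', 'adj', 'fea', 'type'} dicts
for opinion in nlp.commenttag('三星电脑电池不给力', type=13):
    print(opinion['fea'], opinion['adj'], opinion['abstract'])
print(nlp.simnet('穿衣裳', '穿衣服'))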
class NLP():
    def __init__(self):
        """ Your APPID / AK / SK """
        APP_ID = '16043979'
        API_KEY = 'vr2XhyMVrjW7dWZOZjqeLsae'
        SECRET_KEY = 'RypiqTeFnVIED0zpKOxRIZHbc5a8a2wE'
        self.client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

    def lexical_analysis(self):
        '''Lexical analysis provides word segmentation, POS tagging and named-entity
        recognition: it identifies the basic tokens in the text, tags the part of
        speech of the recombined tokens, and further recognises named entities.'''
        text = "百度是一家高科技公司"
        """ Call lexical analysis """
        resp = self.client.lexer(text)
        print(resp)

    def Interdependent(self):
        '''Dependency parsing.
        The dependency parsing interface automatically analyses the dependency structure
        of the text, using the dependencies between words to express syntactic relations
        (subject-predicate, verb-object, attributive, etc.) and a tree to represent the
        structure of the whole sentence.'''
        text = '小情歌'
        resp = self.client.depParser(text)
        print(resp)

    def vector(self):
        '''Word embedding: returns the Chinese word vector of the query word.'''
        word = '张飞'
        resp = self.client.wordEmbedding(word)
        print(resp)

    def Dnn(self):
        '''The Chinese DNN language model outputs the segmentation result and the
        probability of each word in the sentence, judging whether the sentence reads naturally.'''
        # word = '你饭吃在哪?'
        word = '你今天上班了吗'
        resp = self.client.dnnlm(word)
        # ppl (float) measures fluency: the lower the value, the more natural the sentence
        print(resp)
        print(resp['ppl'])

    def compare(self):
        '''Word similarity (the two words should have the same number of characters).'''
        word1 = '茶壶'
        word2 = '水瓶'
        resp = self.client.wordSimEmbedding(word1, word2)
        print(resp)
        # score: similarity score, the closer to 1 the more similar
        print(resp['score'])

    def text_compare(self):
        # Short-text similarity
        text1 = "穿衣裳"
        text2 = "穿衣服"
        """ Call short-text similarity """
        resp = self.client.simnet(text1, text2)
        print(resp)

    def comment(self):
        '''Comment opinion extraction'''
        text = '苹果笔记本后盖不好看'
        """ Optional parameters """
        options = {}
        options["type"] = 13
        """ Call comment opinion extraction with options """
        resp = self.client.commentTag(text, options)
        print(resp)
        print(resp['items'])

    def emotion(self):
        # Sentiment analysis
        text = '今天天气不错'
        resp = self.client.sentimentClassify(text)
        print(resp)
        print(resp['items'])
        print('Positive probability: %s' % resp['items'][0]['positive_prob'])
        print('Negative probability: %s' % resp['items'][0]['negative_prob'])

    def Tag(self):
        '''Article tagging'''
        # The article tagging service quickly understands media articles: given an article
        # with a title, it outputs several content tags with confidences, for personalised
        # recommendation, similar-article clustering and content analysis.
        title = "iphone手机出现“白苹果”原因及解决办法,用苹果手机的可以看下"
        content = "如果下面的方法还是没有解决你的问题建议来我们门店看下成都市锦江区红星路三段99号银石广场24层01室。"
        """ Call article tagging """
        resp = self.client.keyword(title, content)
        print(resp)

    def Ar_classification(self):
        '''Article classification'''
        title = "美男齐聚!吴彦祖冯德伦谢霆锋一起颁奖"
        content = "今晚的金像奖,《特警新人类》主演吴彦祖、冯德伦、谢霆锋、李璨琛一起颁奖,今年是电影上映二十年。一开始只有冯德伦、李璨琛上台,说“他们两个有事来不了”,随后吴彦祖和谢霆锋也从VCR中“走”到了台上,他们现场问大家想不想看《特警新人类3》,气氛热烈。"
        """ Call article classification """
        # An article may belong to more than one category
        resp = self.client.topic(title, content)
        print(resp)
        print(resp['item'])

    def modify(self):
        '''Text correction'''
        text = "只能门锁"
        """ Call text correction """
        resp = self.client.ecnet(text)
        print(resp)
        print(resp['item'])
        print('Text has an error, corrected result: %s' % resp['item']['correct_query'])

    def emotion_qingxu(self):
        text = '今天本来高兴的'
        """ Optional parameters """
        options = {}
        options["scene"] = "default"
        """ Call conversational emotion recognition with options """
        resp = self.client.emotion(text, options)
        print(resp)
        print(resp['items'])
        print(type(resp['items']))
        print('Reply: %s' % resp['items'][0]['replies'])

    def News(self):
        '''News summarisation'''
        # This interface is not enabled for this account
        content = "麻省理工学院的研究团队为无人机在仓库中使用RFID技术进行库存查找等工作,创造了一种..."
        maxSummaryLen = 300
        """ Call the news summary interface """
        resp = self.client.newsSummary(content, maxSummaryLen)
        print(resp)
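A small driver for the demo class above might look like the following sketch; each method already prints the raw SDK response, so there is nothing to collect, and the subset of methods called here is an arbitrary choice.

# Hypothetical driver for the demo class
if __name__ == '__main__':
    nlp = NLP()
    nlp.lexical_analysis()
    nlp.compare()
    nlp.text_compare()
    nlp.comment()
    nlp.emotion()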
# Chinese word embedding
result = aipNlp.wordEmbedding(title[1])
print(result)
# Pass two words to compute their similarity
result = aipNlp.wordSimEmbedding('漂亮', '美丽')
print(result)
# Sentiment classification
result = aipNlp.sentimentClassify('Python具有丰富和强大的库')
# sentiment is the polarity: 0 negative, 1 neutral, 2 positive
print(result)
# Pass a phrase to the Chinese DNN language model for fluency/structure analysis
result = aipNlp.dnnlm('python是程序设计语言')
print(result)
# Pass two short texts to compute their similarity
result = aipNlp.simnet('python是程序设计语言', 'c是程序设计语言')
# score is the similarity of the two texts
print(result)
# Pass a review text to extract opinions and their sentiment
result = aipNlp.commentTag('面包很好吃')
print(result)
# Dependency parsing
result = aipNlp.depParser('python是最好的语言')
print(result)
# coding:utf-8
# Created by xudas on 2017/6/12.
from aip import AipNlp as Nlp
"""
Input:  comments.txt - a text file with one review per line
Output: dict.txt - each line starts with an aspect word, followed by the
        (deduplicated) adjectives that describe it
"""
# change to your sk
sk = 'T80yWu0WAkbOFoKRVQ9p8lZMzj6rLq7S'
client = Nlp('9688683', 'wndZFKVBmUTM5cfMb7C8UaOA', 'T80yWu0WAkbOFoKRVQ9p8lZMzj6rLq7S')
response = client.commentTag('超爱你的')
print(response)
exit()
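The script stops at exit() after a single smoke-test call, while its header promises a comments.txt to dict.txt pipeline. A minimal sketch of that missing loop, reusing the client created above, could look like this; the file names come from the header, and the prop/adj field names are the ones commentTag returns under items.

# Hypothetical completion of the pipeline described in the header
import time
from collections import defaultdict

aspect_adjs = defaultdict(set)  # aspect word -> deduplicated adjectives

with open('comments.txt', encoding='utf-8') as fin:
    for line in fin:
        comment = line.strip()
        if not comment:
            continue
        resp = client.commentTag(comment)
        for item in resp.get('items', []):
            aspect_adjs[item['prop']].add(item['adj'])
        time.sleep(0.2)  # throttle requests

with open('dict.txt', 'w', encoding='utf-8') as fout:
    for prop, adjs in aspect_adjs.items():
        fout.write(prop + ' ' + ' '.join(adjs) + '\n')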
""" APPID AK SK """ APP_ID = '11506585' API_KEY = 'HnN0yHhGevtKHSK5wLu9qndH' SECRET_KEY = 'qsXCP39wozNaQzXD6pVTrlDyjlah6SCI' client = AipNlp(APP_ID, API_KEY, SECRET_KEY) with open("../data/bose-comments-tag", 'wb') as out: with open("../data/bose-comments", "rb") as fobj: lines = fobj.readlines() for line in lines: comment = line.decode() print(comment) out.write(comment.encode('utf-8')) options = {} options["type"] = 13 """ 带参数调用评论观点抽取 """ try: resp = client.commentTag(comment, options) sent = client.sentimentClassify(comment) except Exception as e: resp = e.__str__() pretty_resp = json.dumps(resp, indent=4, ensure_ascii=False) pretty_sent = json.dumps(sent, indent=4, ensure_ascii=False) print(pretty_resp) out.write(pretty_resp.encode('utf-8')) print(pretty_sent) out.write(pretty_sent.encode('utf-8')) print(' -------- ') out.write(' -------- '.encode('utf-8')) time.sleep(0.2)
class NLP(object):
    def __init__(self, APP_ID, API_KEY, SECRET_KEY):
        """Create the AipNlp client from the given credentials."""
        self.client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

    def commentTag(self, News_Series, type_=4):
        """
        Parameters:
            type_: which category you want to match
            News_Series: series of news texts
        -----------------------------------------------
        Returns:
            a DataFrame with columns:
            log_id, prop, adj, sentiment, begin_pos, end_pos, abstract
        """
        options = {"type": type_}

        rows = []
        for text in News_Series:
            res = self.client.commentTag(text, options)  # JSON result from the API
            print(res)
            rows.append({
                "log_id": res["log_id"],
                "prop": res["items"][0]["prop"],
                "adj": res["items"][0]["adj"],
                "sentiment": res["items"][0]["sentiment"],
                "begin_pos": res["items"][0]["begin_pos"],
                "end_pos": res["items"][0]["end_pos"],
                "abstract": res["items"][0]["abstract"],
            })
        res_df = pd.DataFrame(rows)
        return res_df

    def sentiment(self, text_series):
        import time
        """
        Parameters:
            text_series: iterable of strings
        ------------------------------
        Returns:
            DataFrame with columns:
            text          -> str
            sentiment     -> int; polarity: 0 negative, 1 neutral, 2 positive
            confidence    -> float; confidence of the classification
            positive_prob -> float; probability of the positive class
            negative_prob -> float; probability of the negative class
        ------------------------------
        """
        df_sentiment = pd.DataFrame()
        results = []
        for text in text_series:
            results.append(self.client.sentimentClassify(text))
            # Throttle requests to avoid hitting the QPS limit
            time.sleep(3)
        text = [result["text"] for result in results]
        sentiment = [result["items"][0]["sentiment"] for result in results]
        confidence = [result["items"][0]["confidence"] for result in results]
        positive_prob = [result["items"][0]["positive_prob"] for result in results]
        negative_prob = [result["items"][0]["negative_prob"] for result in results]
        df_sentiment["text"] = text
        df_sentiment["sentiment"] = sentiment
        df_sentiment["confidence"] = confidence
        df_sentiment["positive_prob"] = positive_prob
        df_sentiment["negative_prob"] = negative_prob
        return df_sentiment, text, sentiment

    def keyword(self, title, content):
        """Article tagging: content tags for the given title and body."""
        result = self.client.keyword(title, content)
        return result

    def topic(self, title, content):
        """Article classification for the given title and body."""
        result = self.client.topic(title, content)
        return result
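A hypothetical usage of the wrapper above; the credentials and example texts are placeholders, and the two returned DataFrames follow the column layouts documented in the methods.

# Placeholder credentials and texts, purely illustrative
import pandas as pd

nlp = NLP('your-app-id', 'your-api-key', 'your-secret-key')
reviews = pd.Series(['三星电脑电池不给力', '面包很好吃'])
tags_df = nlp.commentTag(reviews, type_=13)          # one opinion row per review
sent_df, texts, polarities = nlp.sentiment(reviews)  # polarity: 0 neg, 1 neutral, 2 pos
print(tags_df.head())
print(sent_df.head())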
# Import the NLP SDK
from aip import AipNlp
import json

# Credentials
APP_ID = '9993998'
API_KEY = 'bkEaVju6jjgSp90xlWE03RLB'
SECRET_KEY = 'xVYqqNPvpBDXh6Hpa7sUKisMbnD4SDcC'

# Initialise the AipNlp client
aipNlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)

# result = aipNlp.lexer('学习雷锋好榜样忠于革命忠于党')
# result1 = aipNlp.sentimentClassify('你是一个大傻逼')
# result2 = aipNlp.dnnlm('飞特是一个优秀的物流公司')
# result3 = aipNlp.simnet('飞特是个国际物流公司', '发货去美国找飞特')

# Comment-tag categories: 1 hotel, 2 KTV, 3 beauty, 4 food (default), 5 travel, 6 health,
# 7 education, 8 business, 9 real estate, 10 cars, 11 life, 12 shopping, 13 3C
option = {'type': 1}  # hotel
# Call comment opinion extraction
result4 = aipNlp.commentTag('如家很便宜', option)

# print(json.dumps(result, ensure_ascii=False))
# print(json.dumps(result1, ensure_ascii=False))
# print(json.dumps(result2, ensure_ascii=False))
# print(json.dumps(result3, ensure_ascii=False))
print(json.dumps(result4, ensure_ascii=False))
if line_count < 700:
    onlyCN_text = format_str(eachrow[-1])
    # print(onlyCN_text)
    if onlyCN_text:
        data['aonlyCN_text'] = onlyCN_text
        sentiment_result = client.sentimentClassify(onlyCN_text)
        result = sentiment_result['items'][-1]
        data['bpositive_score'] = result['positive_prob']
        data['cnegative_score'] = result['negative_prob']
        data['dsentiment_score'] = result['sentiment']
        data['econfidence_int'] = result['confidence']
        time.sleep(random.uniform(1, 7))
        result2 = client.commentTag(onlyCN_text, options13)
        if 'items' in result2:
            data['felectronic_type'] = result2['items'][-1]['prop']
        result3 = client.commentTag(onlyCN_text, options11)
        if 'items' in result3:
            data['glife_type'] = result3['items'][-1]['prop']
        # data holds: onlyCN_text, positive_score, negative_score, sentiment_score,
        # confidence_int, electronic_type, life_type
        print(data)
    print('running row ' + str(line_count))
    line_count += 1
    with open('test.csv', 'a') as f:  # Just use 'w' mode in 3.x
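        # The original snippet stops after opening test.csv without writing anything.
        # A minimal, hypothetical completion: append one row per processed line, with
        # the field names taken from the keys assigned to `data` above (header handling
        # is left out and would only matter for a fresh file).
        import csv  # would normally live at the top of the script
        fieldnames = ['aonlyCN_text', 'bpositive_score', 'cnegative_score',
                      'dsentiment_score', 'econfidence_int',
                      'felectronic_type', 'glife_type']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerow({k: data.get(k, '') for k in fieldnames})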
class BaiduAi:
    """Baidu AI NLP client.
    https://ai.baidu.com/docs#/NLP-Python-SDK/f524c757
    """
    # empCount = 0

    def __init__(self):
        print('kaishi')
        # print(config.get('site', 'name'))
        self.app_id = config.get('baidu', 'app_id')
        self.api_key = config.get('baidu', 'app_key')
        self.secret_key = config.get('baidu', 'secret_key')
        self.client = AipNlp(self.app_id, self.api_key, self.secret_key)
        # """ your APPID AK SK """

    def lexer(self, text):
        """Lexical analysis."""
        return self.client.lexer(text)

    def depParser(self, text):
        """Dependency parsing."""
        return self.client.depParser(text)

    def dnn(self, text):
        """DNN language model."""
        # result = client.synthesis(text, 'zh', 1, {
        #     'vol': 11,
        # })
        # text = "床前明月光"
        # print(client)
        return self.client.dnnlm(text)

    def wordSimEmbedding(self, text1, text2):
        """Word similarity."""
        return self.client.wordSimEmbedding(text1, text2)

    def simnet(self, text1, text2):
        """Short-text similarity."""
        return self.client.simnet(text1, text2)

    def commentTag(self, content):
        """Comment opinion extraction."""
        return self.client.commentTag(content)

    def topic(self, title, content):
        """Article classification."""
        try:
            return self.client.topic(title, content)
        except Exception:
            # Fall back to an empty result if the request fails
            return {'log_id': 8348398184393122510, 'item': {'lv2_tag_list': [], 'lv1_tag_list': []}}

    def keyword(self, title, content):
        """Article tagging: given an article with a title, returns several content tags
        with confidences, for personalised recommendation, similar-article clustering
        and content analysis."""
        return self.client.keyword(title, content)

    def sentimentClassify(self, content):
        """Sentiment classification: judges the polarity (positive, negative, neutral)
        of subjective text and returns a confidence."""
        return self.client.sentimentClassify(content)

    def ecnet(self, content):
        """Text correction."""
        return self.client.ecnet(content)

    def newsSummary(self, title, content):
        """News summarisation (no permission on this account yet)."""
        return self.client.newsSummary(content, 200)
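A hypothetical way to exercise the class above; it assumes the `config` object used in __init__ can resolve the [baidu] credentials, and the example texts are borrowed from the other snippets in this section.

# Illustrative usage only; credentials come from the config file
ai = BaiduAi()
print(ai.lexer('百度是一家高科技公司'))
print(ai.simnet('穿衣裳', '穿衣服'))
print(ai.sentimentClassify('今天天气不错'))
print(ai.keyword('欧洲冠军杯足球赛', '欧洲冠军联赛是欧洲足球协会联盟主办的年度足球比赛'))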
SECRET_KEY = 'r0mHeKH7TWVpPa0weKMVMpQ2whosIPGM'
aipNlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)

result = aipNlp.lexer('黄健是一个帅锅')  # Lexical analysis: word segmentation plus POS tagging
for key in result:
    print(key, result[key])
result = aipNlp.wordEmbedding("黄健")  # Word vector, for word-level vector arithmetic
print(result)
result = aipNlp.wordSimEmbedding('早饭', '早点')  # Word similarity
print(result)
result = aipNlp.wordSimEmbedding('帅', '英俊')  # Word similarity
print(result)
result = aipNlp.wordSimEmbedding('强大', '厉害')  # Word similarity
print(result)
result = aipNlp.sentimentClassify('这家公司差的很')  # Sentiment analysis
print(result)
result = aipNlp.dnnlm('百度是个搜索公司')  # DNN language model (fluency analysis)
print(result)
result = aipNlp.simnet('清华学霸', '清华学渣')  # Short-text similarity
print(result)
result = aipNlp.commentTag('面包很好吃,吃的我拉肚子了')  # Comment opinion extraction with sentiment
print(result)
result = aipNlp.depParser('百度是一家伟大的公司')  # Dependency parsing
print(result)
from aip import AipNlp

""" Your APPID AK SK """
APP_ID = '10677661'
API_KEY = 'CbBnylD4hQekDeGfmNRvITtL'
SECRET_KEY = 'a1mjmnGiaG9ywtLP0RvbjkyIASUylsL0'
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

text = "百度是一家高科技公司"
""" Call lexical analysis """
print(client.lexer(text))

options = {}
options["type"] = 13
print(client.commentTag(text, options))

print(client.sentimentClassify(text))

options = {}
options["scene"] = "talk"
""" Call conversational emotion recognition with options """
print(client.emotion(text, options))
connection = pymongo.MongoClient('120.25.75.23', 27017)
tdb = connection.test
post = tdb.comments
# client = redis.Redis(host='101.236.6.203', port=6379, db=0, password='******')
nlp = Nlp('9688683', 'wndZFKVBmUTM5cfMb7C8UaOA', 'T80yWu0WAkbOFoKRVQ9p8lZMzj6rLq7S')

for i in range(100):
    cursor = post.find().limit(10000)
    for k, item in enumerate(cursor):
        if item.get('propadj') is None or item.get('pro') is None:
            continue
        print(k)
        if not item['text']:
            continue
        text = item['text'].replace('#', '')
        r = nlp.commentTag(text)
        if 'error_code' in r:
            if r['error_code'] != 282130:
                print(r)
        propadj = {}
        if 'items' in r:
            for it in r['items']:
                propadj[it['prop']] = it['adj']
        post.update({"_id": item['_id']},
                    {"$set": {"propadj": propadj, 'pro': True}})
    print(i)
client.depParser(text)
# Dependency parsing: uses the dependencies between words to express syntactic relations
# (subject-predicate, verb-object, attributive, etc.) and a tree structure to represent
# the structure of the whole sentence (subject-predicate-object, adverbial-complement, etc.)

word = "张飞"
client.wordEmbedding(word)  # Chinese word vector

text = "床前明月光"
client.dnnlm(text)  # Outputs the segmentation and per-word probabilities, judging whether the sentence reads naturally

word1 = "北京"
word2 = "上海"
client.wordSimEmbedding(word1, word2)  # Similarity of the two words

text1 = "强大"
text2 = "富强"
client.simnet(text1, text2)  # Short-text similarity

text = "三星电脑电池不给力"
options = {}
options["type"] = 13
client.commentTag(text, options)  # Comment opinion extraction

# Judges the sentiment polarity (positive, negative, neutral) of subjective text and returns a confidence
text = "苹果是一家伟大的公司"
client.sentimentClassify(text)  # Sentiment analysis

title = "iphone手机出现“白苹果”原因及解决办法,用苹果手机的可以看下"
content = "如果下面的方法还是没有解决你的问题建议来我们门店看下成都市锦江区红星路三段99号银石广场24层01室。"
client.keyword(title, content)  # Outputs several content tags with confidences, for recommendation, similar-article clustering and content analysis

title = "欧洲冠军杯足球赛"
content = "欧洲冠军联赛是欧洲足球协会联盟主办的年度足球比赛,代表欧洲俱乐部足球最高荣誉和水平,被认为是全世界最高素质、最具影响力以及最高水平的俱乐部赛事,亦是世界上奖金最高的足球赛事和体育赛事之一。"
client.topic(title, content)  # Automatically classifies the article by content type; the first release supports 26 mainstream types such as entertainment, sport and technology