def maintain():
    """Repair the `data` table: purge broken rows and retry failed abstracts.

    Deletes rows whose context is the sentinel 'error', then re-runs the
    BosonNLP summary for every row whose abstract is 'error'. Rows that
    still fail are deleted; successful retries are written back.
    """
    # NOTE(review): hard-coded API token and DB credentials — move to config/env.
    api_token = "ZUKLt9tO.24611.KI1wUPXknGRP"
    nlp = BosonNLP(api_token)
    conn = pymysql.connect(host='123.206.68.192', port=3306, user='******',
                           passwd='', db='news', charset='utf8')
    try:
        cur = conn.cursor()
        cur.execute("DELETE FROM `data` WHERE context = 'error'")
        conn.commit()
        cur.execute("SELECT * FROM `data` WHERE abstract = 'error'")
        rows = cur.fetchall()
        for entry in rows:
            # entry[0] is assumed to be the primary key and entry[3] the
            # article body — confirm against the table schema.
            result = nlp.summary('', entry[3], 50).replace('\n', '')
            if result == 'error':
                # Original used a Python-2 print statement and str + tuple
                # concatenation (TypeError); format the row instead.
                print('[Deleted]wrong entry: {}'.format(entry))
                # Parameters must be a sequence: (entry[0],) not (entry[0]).
                cur.execute("DELETE FROM `data` WHERE ID = %s", (entry[0],))
            else:
                cur.execute("UPDATE `data` SET abstract = %s WHERE ID = %s",
                            (result, entry[0]))
        cur.close()
        conn.commit()
    finally:
        # Guarantee the connection is released even if a query raises.
        conn.close()
def getAbstract(allContext):
    """Summarize each text in *allContext* with BosonNLP.

    Parameters
    ----------
    allContext : iterable of str
        Article bodies to summarize.

    Returns
    -------
    list of str, same length as the input; items that failed to summarize
    contain the sentinel string 'error'.
    """
    # NOTE(review): `apitoken` is not defined in this function — it must be a
    # module-level name; confirm it exists where this snippet lives.
    nlp = BosonNLP(apitoken)
    ret = []
    for i, text in enumerate(allContext):
        try:
            print("handling %dth abstract from buaa" % (i + 1))
            result = nlp.summary('', text, 50)
            ret.append(result.replace('\n', ''))
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            print("error when handling %dth abstract from buaa" % (i + 1))
            ret.append('error')
            # print_exc() writes the traceback itself; wrapping it in
            # print(...) only printed its None return value.
            traceback.print_exc()
    return ret
def getAbstract(self, allContext):
    """Summarize each text in *allContext* with BosonNLP for this school.

    Parameters
    ----------
    allContext : iterable of str
        Article bodies to summarize.

    Returns
    -------
    list of str, same length as the input; items that failed to summarize
    contain the sentinel string 'error'.
    """
    # NOTE(review): hard-coded API token — move to config/env.
    apitoken = "XB2l3mQj.14588.GJCICyNoqghJ"
    nlp = BosonNLP(apitoken)
    ret = []
    for i, text in enumerate(allContext):
        try:
            print("handling %dth abstract from %s" % (i + 1, self._school))
            result = nlp.summary('', text, 50)
            ret.append(result.replace('\n', ''))
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            print("error when handling %dth abstract from %s" % (i + 1, self._school))
            ret.append('error')
            # print_exc() writes the traceback itself; wrapping it in
            # print(...) only printed its None return value.
            traceback.print_exc()
    return ret
def abstract(text):
    """Summarize *text* with BosonNLP and display the summary via matplotlib."""
    nlp = BosonNLP("x-gOGutn.27554.G6_6QvdJafES")
    summary = nlp.summary("", text)
    plt.figure(figsize=(10, 5))
    plt.subplot(3, 3, 2)
    plt.axis([0, 20, 0, 10])
    # CJK-capable font so the Chinese title renders correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Append a newline after every 60th character so the text wraps.
    chars = list(summary)
    for idx in range(len(chars)):
        if idx and idx % 60 == 0:
            chars[idx] += "\n"
    wrapped = "".join(chars)
    plt.title("摘要提取")
    plt.text(5, 10, wrapped, fontsize=10, style='oblique',
             ha='center', va='top', wrap=True)
    plt.axis("off")
    plt.show()
class _BosonNLPWrapper(object): """ NLP object using the BosonNLP API Python SDK. """ news_categories = [ 'physical education', 'education', 'finance', 'society', 'entertainment', 'military', 'domestic', 'science and technology', 'the internet', 'real estate', 'international', 'women', 'car', 'game' ] def __init__(self, api_token=None): try: assert api_token is not None, "Please provide an API token" except AssertionError as e: raise self.token = api_token self.nlp = BosonNLP(self.token) def get_sentiment(self, text): pos, neg = self.nlp.sentiment(text)[0] return {'positive': pos, 'negative': neg} def classify_news(self, text): numbering = range(len(_BosonNLPWrapper.news_categories)) cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories)) clsfy_num = self.nlp.classify(text)[0] return cats_dict[clsfy_num] def extract_keywords(self, text, top_k=3): result = self.nlp.extract_keywords( text, top_k) # outputs in sorted order of weight return [{result[i][1]: result[i][0]} for i in range(len(result))] def segment_words_and_tag(self, text): """ Splits up text into segments of "words" and tags them with their respective part of speech. See: http://docs.bosonnlp.com/tag.html Parameters ---------- text (string): text passage to segment into separate "words" and tags them with parts of speech Returns ------- list of key-value pairs {word: part-of-speech-tag} """ result = self.nlp.tag(text)[0] words = result['word'] tags = result['tag'] return [{words[i]: tags[i]} for i in range(len(words))] def get_summary(self, content, title='', pct_limit=0.2): """ Extracts a new digest (summary) of the content. 
See: http://docs.bosonnlp.com/summary.html Parameters ---------- text (string): text passage to summarize title (string): title of the passage (optional, may provide more accurate results) pct_limit (float): max length of the summary in terms of percentage of the original word count Returns ------- string containing the summary of the passage """ summary = self.nlp.summary(title, content, pct_limit) return summary
class BosonNlpp:
    """Thin wrapper around the BosonNLP SDK exposing its main endpoints."""

    def __init__(self):
        # NOTE(review): hard-coded API token — move to config/env.
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # Sentiment analysis
    def testSentiment(self, s):
        result = self.bonlp.sentiment(s)
        return result

    # Named entity recognition
    def lexicalAnalysis(self, s):
        result = self.bonlp.ner(s)[0]
        return result

    # Dependency parsing
    def textDependency(self, s):
        result = self.bonlp.depparser(s)
        return result

    # Keyword extraction
    def testKeywords(self, s):
        result = self.bonlp.extract_keywords(s, top_k=10)
        return result

    # News classification: map the API's integer label to a Chinese category name
    def textClassify(self, s):
        resultlist = self.bonlp.classify(s)
        classifys = {0: '体育', 1: '教育', 2: '财经', 3: '社会', 4: '娱乐',
                     5: '军事', 6: '国内', 7: '科技', 8: '互联网', 9: '房产',
                     10: '国际', 11: '女人', 12: '汽车', 13: '游戏'}
        return (classifys[resultlist[0]])

    # Semantic suggestion (related terms)
    def lexicalSynonym(self, term):
        result = self.bonlp.suggest(term, top_k=10)
        return result

    # Word segmentation and part-of-speech tagging
    def fenci(self, s):
        result = self.bonlp.tag(s)
        return result

    def newssubstract(self, s):
        """Return the BosonNLP summary of *s* (str or UTF-8 bytes)."""
        # The original decoded unconditionally, which raises AttributeError
        # for Python 3 str input (str has no .decode); only decode bytes.
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        result = self.bonlp.summary('', s)
        return result
class _BosonNLPWrapper(object): """ NLP object using the BosonNLP API Python SDK. """ news_categories = ['physical education', 'education', 'finance', 'society', 'entertainment', 'military', 'domestic', 'science and technology', 'the internet', 'real estate', 'international', 'women', 'car', 'game'] def __init__(self, api_token=None): try: assert api_token is not None, "Please provide an API token" except AssertionError as e: raise self.token = api_token self.nlp = BosonNLP(self.token) def get_sentiment(self, text): """ Performs sentiment analysis on a text passage (works for Chinese text). See: http://docs.bosonnlp.com/sentiment.html Parameters ---------- text (string): text passage to be analyzed for sentiment Returns ------- dictionary with 'positive' and 'negative' as keys with their respective weights as values >>> nlp = BosonNLPWrapper('') >>> nlp.get_sentiment('不要打擾我') {'positive': 0.3704911989140307, 'negative': 0.6295088010859693} >>> nlp.get_sentiment('我很高興跟你見面') {'positive': 0.856280735624867, 'negative': 0.14371926437513308} """ pos, neg = self.nlp.sentiment(text)[0] return {'positive': pos, 'negative': neg} def classify_news(self, text): """ Classifies news text into 14 different categories. See: http://docs.bosonnlp.com/classify.html Parameters ---------- text (string): text passage to classify into news categories defined in news_categories Returns ------- one of the 14 categories in news_categories that the text was classified into """ numbering = range(len(_BosonNLPWrapper.news_categories)) cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories)) clsfy_num = self.nlp.classify(text)[0] return cats_dict[clsfy_num] def extract_keywords(self, text, top_k=3): """ Extracts the top k keywords and the weight of each word in the text. 
See: http://docs.bosonnlp.com/keywords.html Parameters ---------- text (string): text passage from which to extract keywords top_k (integer): number of keywords to return Returns ------- list of key-value pairs {word: weight} >>> nlp = BosonNLPWrapper('') >>> nlp.extract_keywords('我最愛老虎堂,奶茶香醇,波霸彈Q 好香的黑糖味') [{'波霸彈': 0.5980681967308248}, {'黑糖': 0.4699792421671365}, {'香醇': 0.4497614275300947}] """ result = self.nlp.extract_keywords(text, top_k) # outputs in sorted order of weight return [{result[i][1]: result[i][0]} for i in range(len(result))] def segment_words_and_tag(self, text): """ Splits up text into segments of "words" and tags them with their respective part of speech. See: http://docs.bosonnlp.com/tag.html Parameters ---------- text (string): text passage to segment into separate "words" and tags them with parts of speech Returns ------- list of key-value pairs {word: part-of-speech-tag} """ result = self.nlp.tag(text)[0] words = result['word'] tags = result['tag'] return [{words[i]: tags[i]} for i in range(len(words))] def get_summary(self, content, title='', pct_limit=0.2): """ Extracts a new digest (summary) of the content. See: http://docs.bosonnlp.com/summary.html Parameters ---------- text (string): text passage to summarize title (string): title of the passage (optional, may provide more accurate results) pct_limit (float): max length of the summary in terms of percentage of the original word count Returns ------- string containing the summary of the passage """ summary = self.nlp.summary(title, content, pct_limit) return summary