Exemplo n.º 1
0
def maintain():
    apitoken = "ZUKLt9tO.24611.KI1wUPXknGRP"
    nlp = BosonNLP(apitoken)
    conn = pymysql.connect(host='123.206.68.192',
                           port=3306,
                           user='******',
                           passwd='',
                           db='news',
                           charset='utf8')
    cur = conn.cursor()
    cur.execute("DELETE FROM `data` WHERE context = 'error'")
    conn.commit()

    cur.execute("SELECT * FROM `data` WHERE	abstract = 'error'")
    data = cur.fetchall()
    for entry in data:
        result = nlp.summary('', entry[3], 50).replace('\n', '')
        if (result == 'error'):
            print '[Deleted]wrong entry: ' + entry
            cur.execute("DELETE FROM `data` WHERE ID = %s", (entry[0]))
        else:
            cur.execute("UPDATE `data` SET abstract = %s WHERE ID = %s",
                        (result, entry[0]))

    cur.close()
    conn.commit()
    conn.close()
Exemplo n.º 2
0
def getAbstract(allContext):
    nlp = BosonNLP(apitoken)
    ret = []
    for i, text in enumerate(allContext):
        try:
            print("handling %dth abstract from buaa" % (i + 1))
            result = nlp.summary('', text, 50)
            ret.append(result.replace('\n', ''))
        except:
            print("error when handling %dth abstract from buaa" % (i + 1))
            ret.append('error')
            print(traceback.print_exc())
    return ret
Exemplo n.º 3
0
 def getAbstract(self, allContext):
     apitoken = "XB2l3mQj.14588.GJCICyNoqghJ"
     nlp = BosonNLP(apitoken)
     ret = []
     for i, text in enumerate(allContext):
         try:
             print("handling %dth abstract from %s" % (i + 1, self._school))
             result = nlp.summary('', text, 50)
             ret.append(result.replace('\n', ''))
         except:
             print("error when handling %dth abstract from %s" %
                   (i + 1, self._school))
             ret.append('error')
             print(traceback.print_exc())
     return ret
Exemplo n.º 4
0
def abstract(text):
    nlp = BosonNLP("x-gOGutn.27554.G6_6QvdJafES")
    rest = nlp.summary("", text)
    plt.figure(figsize=(10, 5))
    plt.subplot(3, 3, 2)
    plt.axis([0, 20, 0, 10])
    plt.rcParams['font.sans-serif'] = ['SimHei']
    rest = list(rest)
    for i in range(len(rest)):
        if i and i % 60 == 0:
            rest[i] += "\n"
    rest = "".join(rest)
    plt.title("摘要提取")
    plt.text(5,
             10,
             rest,
             fontsize=10,
             style='oblique',
             ha='center',
             va='top',
             wrap=True)
    plt.axis("off")
    plt.show()
Exemplo n.º 5
0
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = [
        'physical education', 'education', 'finance', 'society',
        'entertainment', 'military', 'domestic', 'science and technology',
        'the internet', 'real estate', 'international', 'women', 'car', 'game'
    ]

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError as e:
            raise

        self.token = api_token
        self.nlp = BosonNLP(self.token)

    def get_sentiment(self, text):

        pos, neg = self.nlp.sentiment(text)[0]

        return {'positive': pos, 'negative': neg}

    def classify_news(self, text):

        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))

        clsfy_num = self.nlp.classify(text)[0]

        return cats_dict[clsfy_num]

    def extract_keywords(self, text, top_k=3):

        result = self.nlp.extract_keywords(
            text, top_k)  # outputs in sorted order of weight

        return [{result[i][1]: result[i][0]} for i in range(len(result))]

    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']

        return [{words[i]: tags[i]} for i in range(len(words))]

    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a new digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        text (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)

        return summary
Exemplo n.º 6
0
class BosonNlpp:
    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    #情感分析
    def testSentiment(self, s):
        result = self.bonlp.sentiment(s)
        return result
        #print(result)

    #命名实体识别
    def lexicalAnalysis(self, s):
        result = self.bonlp.ner(s)[0]
        return result

    #依存文法分析
    def textDependency(self, s):
        result = self.bonlp.depparser(s)
        return result

    #关键词提取
    def testKeywords(self, s):
        result = self.bonlp.extract_keywords(s, top_k=10)
        return result

    #新闻分类
    def textClassify(self, s):
        resultlist = self.bonlp.classify(s)
        classifys = {
            0: '体育',
            1: '教育',
            2: '财经',
            3: '社会',
            4: '娱乐',
            5: '军事',
            6: '国内',
            7: '科技',
            8: '互联网',
            9: '房产',
            10: '国际',
            11: '女人',
            12: '汽车',
            13: '游戏'
        }
        return (classifys[resultlist[0]])

    #语义联想
    def lexicalSynonym(self, term):
        result = self.bonlp.suggest(term, top_k=10)
        return result

    #分词与词性标注
    def fenci(self, s):
        result = self.bonlp.tag(s)
        return result

    def newssubstract(self, s):
        #s=s.encode('utf8')
        s = s.decode('utf-8')
        result = self.bonlp.summary('', s)
        return result
Exemplo n.º 7
0
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = ['physical education', 'education', 'finance', 'society', 'entertainment', 'military',
                       'domestic', 'science and technology', 'the internet', 'real estate', 'international',
                       'women', 'car', 'game']

    def __init__(self, api_token=None):
        try:
            assert api_token is not None, "Please provide an API token"
        except AssertionError as e:
            raise

        self.token = api_token
        self.nlp = BosonNLP(self.token)


    def get_sentiment(self, text):
        """
        Performs sentiment analysis on a text passage (works for Chinese text).
        See: http://docs.bosonnlp.com/sentiment.html

        Parameters
        ----------
        text (string): text passage to be analyzed for sentiment


        Returns
        -------
        dictionary with 'positive' and 'negative' as keys with their respective weights as values

        >>> nlp = BosonNLPWrapper('')
        >>> nlp.get_sentiment('不要打擾我')
        {'positive': 0.3704911989140307, 'negative': 0.6295088010859693}
        >>> nlp.get_sentiment('我很高興跟你見面')
        {'positive': 0.856280735624867, 'negative': 0.14371926437513308}
        """
        pos, neg = self.nlp.sentiment(text)[0]

        return {'positive': pos, 'negative': neg}


    def classify_news(self, text):
        """
        Classifies news text into 14 different categories.
        See: http://docs.bosonnlp.com/classify.html

        Parameters
        ----------
        text (string): text passage to classify into news categories defined in news_categories

        Returns
        -------
        one of the 14 categories in news_categories that the text was classified into
        """
        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))

        clsfy_num = self.nlp.classify(text)[0]

        return cats_dict[clsfy_num]


    def extract_keywords(self, text, top_k=3):
        """
        Extracts the top k keywords and the weight of each word in the text.
        See: http://docs.bosonnlp.com/keywords.html

        Parameters
        ----------
        text (string): text passage from which to extract keywords
        top_k (integer): number of keywords to return

        Returns
        -------
        list of key-value pairs {word: weight}


        >>> nlp = BosonNLPWrapper('')
        >>> nlp.extract_keywords('我最愛老虎堂,奶茶香醇,波霸彈Q 好香的黑糖味')
        [{'波霸彈': 0.5980681967308248}, {'黑糖': 0.4699792421671365}, {'香醇': 0.4497614275300947}]
        """
        result = self.nlp.extract_keywords(text, top_k)  # outputs in sorted order of weight

        return [{result[i][1]: result[i][0]} for i in range(len(result))]


    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']

        return [{words[i]: tags[i]} for i in range(len(words))]


    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a new digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        text (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)

        return summary