def classifyOpportunity(self, title, info):
    """Classify an opportunity into a category tag based on keyword checks.

    Tokenizes both the title and the info text (stopwords removed), then
    runs the keyword checks in a fixed priority order: title-based checks
    generally outrank info-based ones. Returns the first matching tag, or
    'Other' when nothing matches.
    """
    titleTokens = TokenizeOnWhitespacePunctuation(
        title, applyStopwords=True).getUnigrams()
    infoTokens = TokenizeOnWhitespacePunctuation(
        info, applyStopwords=True).getUnigrams()

    # Ordered (predicate, token list, tag) table — order encodes priority
    # and must match the original elif cascade exactly.
    orderedChecks = [
        (self.checkFellowshipKeywordsTitle, titleTokens, 'Fellowship'),
        (self.checkInternshipKeywords, titleTokens, 'Internship'),
        (self.checkScholarshipKeywords, titleTokens, 'Scholarship'),
        (self.checkGrantKeywords, titleTokens, 'Grant'),
        (self.checkFellowshipKeywordsInfo, infoTokens, 'Fellowship'),
        (self.checkInternshipKeywords, infoTokens, 'Internship'),
        (self.checkGrantKeywords, infoTokens, 'Grant'),
        (self.checkAwardKeywords, titleTokens, 'Award'),
        (self.checkScholarshipKeywords, infoTokens, 'Scholarship'),
        (self.checkResearchKeywords, titleTokens, 'Research'),
        (self.checkResearchKeywords, infoTokens, 'Research'),
    ]
    for predicate, tokens, tag in orderedChecks:
        if predicate(tokens):
            return tag
    return 'Other'
def test_RemoveUrls(self):
    """URL-like tokens (e.g. 'cats.org') are dropped from the unigrams."""
    sample = 'I like cats cats.org'
    expected = ['i', 'like', 'cats']

    tokenizer = TokenizeOnWhitespacePunctuation(sample)
    self.assertEqual(expected, tokenizer.getUnigrams())
def test_TokenizeOnWhitespacePunctuationUnigrams(self):
    """Basic tokenization: lowercased words, trailing punctuation removed."""
    sample = 'I like cats and birds.'
    expected = ['i', 'like', 'cats', 'and', 'birds']

    tokenizer = TokenizeOnWhitespacePunctuation(sample)
    self.assertEqual(expected, tokenizer.getUnigrams())
def test_BothUnigramsBigramsApplyStopwords(self):
    """With stopwords applied, only content words and their bigrams remain."""
    sample = 'I like cats and birds.'
    expected = ['cats', 'birds', 'cats birds']

    tokenizer = TokenizeOnWhitespacePunctuation(sample, applyStopwords=True)
    self.assertEqual(expected, tokenizer.getBothUnigramsBigrams())
# NOTE(review): a later method in this file defines the exact same name,
# so this definition is shadowed and never runs under unittest. The two
# bodies are currently identical, so no coverage is lost — but one of the
# duplicates should be removed or renamed to avoid silent divergence.
def test_TokenizeOnWhitespacePunctuationBothUnigramsBigrams(self):
    """Without stopword removal, all unigrams plus all adjacent bigrams are returned."""
    # set up
    teststring = 'I like cats and birds.'
    both = ['i', 'like', 'cats', 'and', 'birds',
            'i like', 'like cats', 'cats and', 'and birds']
    # test
    testtokenize = TokenizeOnWhitespacePunctuation(teststring)
    self.assertEqual(both, testtokenize.getBothUnigramsBigrams())
# NOTE(review): this redefines (and therefore supersedes) an identical
# earlier method of the same name in this file — the duplicate should be
# removed or renamed.
def test_TokenizeOnWhitespacePunctuationBothUnigramsBigrams(self):
    """Without stopword removal, all unigrams plus all adjacent bigrams are returned."""
    sample = 'I like cats and birds.'
    expected = [
        'i', 'like', 'cats', 'and', 'birds',
        'i like', 'like cats', 'cats and', 'and birds',
    ]

    tokenizer = TokenizeOnWhitespacePunctuation(sample)
    self.assertEqual(expected, tokenizer.getBothUnigramsBigrams())
def getNgrams(text, getUnigrams=True, getBigrams=True, getTrigrams=False):
    """Extract n-grams from *text*, sentence by sentence.

    The text is split into sentences, each sentence is tokenized into
    lowercased, stopword-filtered unigrams, and n-grams are built within
    sentence boundaries only (no n-gram spans two sentences).

    Args:
        text: the input text to analyze.
        getUnigrams: collect single tokens when True.
        getBigrams: collect adjacent token pairs ('a b') when True.
        getTrigrams: collect adjacent token triples ('a b c') when True.

    Returns:
        [unigrams, bigrams, trigrams] — three lists of strings; any list
        whose flag is False is left empty.
    """
    unigrams = []
    bigrams = []
    trigrams = []
    sentences = TokenizeIntoSentences().doTokenize(text)
    for sentence in sentences:
        words = TokenizeOnWhitespacePunctuation(
            sentence, keepCaps=False, applyStopwords=True).getUnigrams()
        if getUnigrams:
            unigrams.extend(words)
        if getBigrams:
            # zip over offset slices yields the adjacent pairs in order.
            bigrams.extend(
                '%s %s' % (a, b) for a, b in zip(words, words[1:]))
        if getTrigrams:
            trigrams.extend(
                '%s %s %s' % (a, b, c)
                for a, b, c in zip(words, words[1:], words[2:]))
    return [unigrams, bigrams, trigrams]