def classifyOpportunity(self, title, info):
    """Classify an opportunity into a category tag based on keyword checks.

    Tokenizes both the title and the info text (stopwords removed), then
    runs the keyword checks in a fixed priority order: title-based checks
    generally outrank info-based ones. Returns the first matching tag, or
    'Other' when nothing matches.
    """
    titleTokens = TokenizeOnWhitespacePunctuation(
        title, applyStopwords=True).getUnigrams()
    infoTokens = TokenizeOnWhitespacePunctuation(
        info, applyStopwords=True).getUnigrams()

    # Ordered (predicate, token list, tag) table — order encodes priority
    # and must match the original elif cascade exactly.
    orderedChecks = [
        (self.checkFellowshipKeywordsTitle, titleTokens, 'Fellowship'),
        (self.checkInternshipKeywords, titleTokens, 'Internship'),
        (self.checkScholarshipKeywords, titleTokens, 'Scholarship'),
        (self.checkGrantKeywords, titleTokens, 'Grant'),
        (self.checkFellowshipKeywordsInfo, infoTokens, 'Fellowship'),
        (self.checkInternshipKeywords, infoTokens, 'Internship'),
        (self.checkGrantKeywords, infoTokens, 'Grant'),
        (self.checkAwardKeywords, titleTokens, 'Award'),
        (self.checkScholarshipKeywords, infoTokens, 'Scholarship'),
        (self.checkResearchKeywords, titleTokens, 'Research'),
        (self.checkResearchKeywords, infoTokens, 'Research'),
    ]
    for predicate, tokens, tag in orderedChecks:
        if predicate(tokens):
            return tag
    return 'Other'
def test_RemoveUrls(self):
    """URL-like tokens (e.g. 'cats.org') are dropped from the unigrams."""
    sample = 'I like cats cats.org'
    expected = ['i', 'like', 'cats']

    tokenizer = TokenizeOnWhitespacePunctuation(sample)
    self.assertEqual(expected, tokenizer.getUnigrams())
def test_TokenizeOnWhitespacePunctuationUnigrams(self):
    """Basic tokenization: lowercased words, trailing punctuation removed."""
    sample = 'I like cats and birds.'
    expected = ['i', 'like', 'cats', 'and', 'birds']

    tokenizer = TokenizeOnWhitespacePunctuation(sample)
    self.assertEqual(expected, tokenizer.getUnigrams())
def test_BothUnigramsBigramsApplyStopwords(self):
    """With stopwords applied, only content words and their bigrams remain."""
    sample = 'I like cats and birds.'
    expected = ['cats', 'birds', 'cats birds']

    tokenizer = TokenizeOnWhitespacePunctuation(sample, applyStopwords=True)
    self.assertEqual(expected, tokenizer.getBothUnigramsBigrams())
# NOTE(review): a later method in this file defines the exact same name,
# so this definition is shadowed and never runs under unittest. The two
# bodies are currently identical, so no coverage is lost — but one of the
# duplicates should be removed or renamed to avoid silent divergence.
def test_TokenizeOnWhitespacePunctuationBothUnigramsBigrams(self):
    """Without stopword removal, all unigrams plus all adjacent bigrams are returned."""
    # set up
    teststring = 'I like cats and birds.'
    both = ['i', 'like', 'cats', 'and', 'birds',
            'i like', 'like cats', 'cats and', 'and birds']
    # test
    testtokenize = TokenizeOnWhitespacePunctuation(teststring)
    self.assertEqual(both, testtokenize.getBothUnigramsBigrams())
# NOTE(review): this redefines (and therefore supersedes) an identical
# earlier method of the same name in this file — the duplicate should be
# removed or renamed.
def test_TokenizeOnWhitespacePunctuationBothUnigramsBigrams(self):
    """Without stopword removal, all unigrams plus all adjacent bigrams are returned."""
    sample = 'I like cats and birds.'
    expected = [
        'i', 'like', 'cats', 'and', 'birds',
        'i like', 'like cats', 'cats and', 'and birds',
    ]

    tokenizer = TokenizeOnWhitespacePunctuation(sample)
    self.assertEqual(expected, tokenizer.getBothUnigramsBigrams())
def getNgrams(text, getUnigrams=True, getBigrams=True, getTrigrams=False):
    """Extract n-grams from *text*, sentence by sentence.

    The text is split into sentences, each sentence is tokenized into
    lowercased, stopword-filtered unigrams, and n-grams are built within
    sentence boundaries only (no n-gram spans two sentences).

    Args:
        text: the input text to analyze.
        getUnigrams: collect single tokens when True.
        getBigrams: collect adjacent token pairs ('a b') when True.
        getTrigrams: collect adjacent token triples ('a b c') when True.

    Returns:
        [unigrams, bigrams, trigrams] — three lists of strings; any list
        whose flag is False is left empty.
    """
    unigrams = []
    bigrams = []
    trigrams = []
    sentences = TokenizeIntoSentences().doTokenize(text)
    for sentence in sentences:
        words = TokenizeOnWhitespacePunctuation(
            sentence, keepCaps=False, applyStopwords=True).getUnigrams()
        if getUnigrams:
            unigrams.extend(words)
        if getBigrams:
            # zip over offset slices yields the adjacent pairs in order.
            bigrams.extend(
                '%s %s' % (a, b) for a, b in zip(words, words[1:]))
        if getTrigrams:
            trigrams.extend(
                '%s %s %s' % (a, b, c)
                for a, b, c in zip(words, words[1:], words[2:]))
    return [unigrams, bigrams, trigrams]