Пример #1
0
    def extract(self, document):
        """Extract keywords from ``document`` and return them as a tuple.

        The document is split into sentences, every sentence is split into
        word tokens, the tokenized sentences are rated by
        ``compute_ratings`` and ``pick_keywords(ratings, 5)`` chooses the
        entries that are returned.
        """
        # TODO: need to pos tag words for picking only nouns
        # TODO: need to stem tokens for improving accuracy
        tokenized_sentences = [
            stringUtils.word_tokenize(sentence)
            for sentence in stringUtils.sent_tokenize(document)
        ]

        scores = self.compute_ratings(tokenized_sentences)
        return tuple(self.pick_keywords(scores, 5))
Пример #2
0
    def test_word_tokenize_with_stem(self):
        """Check word_tokenize output for "crying buying" with stem=True.

        Stopword filtering is switched off; the expected tokens are
        "cry" and "buy".
        """
        expected_stems = ("cry", "buy")

        actual = stringUtils.word_tokenize(
            "crying buying",
            filter_stopwords=False,
            stem=True,
        )

        self.assertTupleEqual(expected_stems, actual)
Пример #3
0
    def summarize(self, document, summaryLength):
        """Summarize ``document`` and return the chosen sentences as a tuple.

        Steps, in order:
          1. split the document into sentences and tokenize each one with
             stemming enabled;
          2. build a cosine matrix from the tokens using ``self._treshold``;
          3. normalize that matrix;
          4. rate the normalized matrix using ``self._epsilon``;
          5. let ``pick_best_sentences`` choose from the original sentences
             given the ratings and ``summaryLength``.
        """
        sentences = stringUtils.sent_tokenize(document)

        # Tokenize sentence by sentence so tokens stay aligned with sentences.
        stemmed_tokens = []
        for sentence in sentences:
            stemmed_tokens.append(stringUtils.word_tokenize(sentence, stem=True))

        similarity = self.compute_cosine(stemmed_tokens, self._treshold)
        similarity = self.normalize_matrix(similarity)
        scores = self.compute_ratings(similarity, self._epsilon)

        return tuple(self.pick_best_sentences(sentences, scores, summaryLength))
Пример #4
0
    def test_word_tokenize(self):
        """Check word_tokenize output for "This is a sample.".

        Both stopword filtering and stemming are switched off; the expected
        tokens are "this", "is", "a" and "sample".
        """
        expected_tokens = ("this", "is", "a", "sample")

        actual = stringUtils.word_tokenize(
            "This is a sample.",
            filter_stopwords=False,
            stem=False,
        )

        self.assertTupleEqual(expected_tokens, actual)
Пример #5
0
    def test_word_tokenize_with_stopwords_filter(self):
        """Check word_tokenize output with filter_stopwords=True.

        For the sample sentence only "choose", "article", "listed" and
        "site" are expected to remain.
        """
        sentence = "How do you choose the article that's listed on the site."
        expected_tokens = ("choose", "article", "listed", "site")

        actual = stringUtils.word_tokenize(sentence, filter_stopwords=True)

        self.assertTupleEqual(expected_tokens, actual)