Example #1
 def word_tokenize(self, sentences, include_punc=True):
     #: Do not process empty strings (Issue #3)
     if sentences.strip() == "":
         return []
     _tokens = sentences.split(" ")
     #: Handle strings consisting of a single punctuation mark separately (Issue #4)
     if len(_tokens) == 1:
         if _tokens[0] in PUNCTUATION:
             if include_punc:
                 return _tokens
             else:
                 return []
     if include_punc:
         last_word = _tokens[-1]
         # Make sure that you do not separate '.' tokens into ['', '.']
         # (Issue #5)
         if last_word.endswith('.') and len(last_word) > 1:
             _tokens = _tokens[:-1] + [last_word[:-1], '.']
         return _tokens
     else:
         # Return each word token
         # Strips punctuation unless the word comes from a contraction
         # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
         # e.g. "hat's" => ["hat", "'s"]
         # e.g. "home." => ['home']
         words = [word if word.startswith("'") else strip_punc(word, all=False)
                  for word in _tokens if strip_punc(word, all=False)]
         return list(words)
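The filtering comprehension at the end is the pattern shared by most of the examples on this page: with all=False, strip_punc (as implemented in textblob.utils) trims punctuation only from the ends of a token, so a stand-alone punctuation token becomes an empty string and is filtered out, while tokens beginning with an apostrophe are kept verbatim to preserve contraction fragments. A minimal, hedged sketch of that behaviour on a hand-written token list (the input is hypothetical, chosen to mirror the comments above):

from textblob.utils import strip_punc

# Tokens as a Treebank-style tokenizer might emit them (hypothetical input).
tokens = ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun', '!']

words = [w if w.startswith("'") else strip_punc(w, all=False)
         for w in tokens if strip_punc(w, all=False)]
print(words)  # expected: ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun']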
Example #2
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                    for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
Example #3
 def word_tokenize(self, sentences, include_punc=True):
     #: Do not process empty strings (Issue #3)
     if sentences.strip() == "":
         return []
     _tokens = sentences.split(" ")
     #: Handle strings consisting of a single punctuation mark separately (Issue #4)
     if len(_tokens) == 1:
         if _tokens[0] in PUNCTUATION:
             if include_punc:
                 return _tokens
             else:
                 return []
     if include_punc:
         last_word = _tokens[-1]
         # Make sure that you do not separate '.' tokens into ['', '.']
         # (Issue #5)
         if last_word.endswith('.') and len(last_word) > 1:
             _tokens = _tokens[:-1] + [last_word[:-1], '.']
         return _tokens
     else:
         # Return each word token
         # Strips punctuation unless the word comes from a contraction
         # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
         # e.g. "hat's" => ["hat", "'s"]
         # e.g. "home." => ['home']
         words = [word if word.startswith("'") else strip_punc(word, all=False)
                  for word in _tokens if strip_punc(word, all=False)]
         return list(words)
Example #4
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
Example #5
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in word_tokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
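TextBlob ships a contains_extractor in textblob.classifiers that matches this snippet. A hedged usage sketch (the exact key strings depend on how the installed version tokenizes, so treat the printed shape as illustrative):

from textblob.classifiers import contains_extractor

# Assumes NLTK's 'punkt' tokenizer data is installed (textblob's word_tokenize uses it).
features = contains_extractor("The quick brown fox jumps over the lazy dog")
# Expected shape: {'contains(The)': True, 'contains(quick)': True, ...}
print(sorted(features.keys())[:3])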
Example #6
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
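This variant goes through WordTokenizer().itokenize, textblob's lazy (generator) counterpart to tokenize(); extra keyword arguments are forwarded to tokenize(), so include_punc=False works the same way as above. A short hedged sketch, assuming textblob and its NLTK 'punkt' data are installed:

from textblob.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
# itokenize() yields tokens lazily; list() realises them for display.
print(list(tokenizer.itokenize("Can't stop now.", include_punc=False)))
# expected roughly: ['Ca', "n't", 'stop', 'now']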
Example #7
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens

def basic_extractor(document, train_set):
    word_features = _get_words_from_dataset(train_set)
    tokens = _get_document_tokens(document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
Example #8
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
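basic_extractor is also exposed as textblob.classifiers.basic_extractor, and the docstring above describes its contract: keys come from the training vocabulary, values say whether that word occurs in the document. A hedged sketch with a toy train_set (key names are illustrative, so .get is used instead of indexing):

from textblob.classifiers import basic_extractor

# Assumes NLTK's 'punkt' tokenizer data is installed for word_tokenize.
train = [("I love this sandwich", "pos"),
         ("I do not like this restaurant", "neg")]
features = basic_extractor("I love the sandwich place", train)
print(features.get('contains(love)'))        # expected: True
print(features.get('contains(restaurant)'))  # expected: False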
Example #9
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                    for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
Example #10
    def tokenize(self, text, include_punc=True):
        '''Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as separate tokens. Defaults to True.
        '''
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens
        else:
            # Return each word token
            # Strips punctuation unless the word comes from a contraction
            # e.g. "Let's" => ["Let", "'s"]
            # e.g. "Can't" => ["Ca", "n't"]
            # e.g. "home." => ['home']
            return [word if word.startswith("'") else strip_punc(word, all=False)
                    for word in tokens if strip_punc(word, all=False)]
Example #11
    def tokenize(self, text, include_punc=True):
        '''Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as separate tokens. Defaults to True.
        '''
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens
        else:
            # Return each word token
            # Strips punctuation unless the word comes from a contraction
            # e.g. "Let's" => ["Let", "'s"]
            # e.g. "Can't" => ["Ca", "n't"]
            # e.g. "home." => ['home']
            return [
                word if word.startswith("'") else strip_punc(word, all=False)
                for word in tokens if strip_punc(word, all=False)
            ]
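Both of the previous two examples delegate the actual tokenization to nltk.tokenize.word_tokenize and only post-process its output. A hedged sketch of that post-processing step, assuming NLTK and its 'punkt' tokenizer data are available:

import nltk
from textblob.utils import strip_punc

# Assumes NLTK's 'punkt' sentence/word tokenizer data is already installed.
tokens = nltk.tokenize.word_tokenize("Can't stop. Let's go home.")
words = [w if w.startswith("'") else strip_punc(w, all=False)
         for w in tokens if strip_punc(w, all=False)]
print(words)  # expected roughly: ['Ca', "n't", 'stop', 'Let', "'s", 'go', 'home']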
Example #12
    def word_tokenize(self, text, include_punc=True):
        """The Treebank tokenizer uses regular expressions to tokenize text as
        in Penn Treebank.

        It assumes that the text has already been segmented into sentences,
        e.g. using ``self.sent_tokenize()``.

        This tokenizer performs the following steps:

        - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
        - treat most punctuation characters as separate tokens
        - split off commas and single quotes, when followed by whitespace
        - separate periods that appear at the end of line

        Source: NLTK's docstring of ``TreebankWordTokenizer`` (accessed: 02/10/2014)

        """
        #: Do not process empty strings (Issue #3)
        if text.strip() == "":
            return []
        _tokens = self.word_tok.tokenize(text)
        #: Handle strings consisting of a single punctuation mark separately (Issue #4)
        if len(_tokens) == 1:
            if _tokens[0] in PUNCTUATION:
                if include_punc:
                    return _tokens
                else:
                    return []
        if include_punc:
            return _tokens
        else:
            # Return each word token
            # Strips punctuation unless the word comes from a contraction
            # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
            # e.g. "hat's" => ["hat", "'s"]
            # e.g. "home." => ['home']
            words = [word if word.startswith("'") else strip_punc(word, all=False)
                     for word in _tokens if strip_punc(word, all=False)]
            return list(words)
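The docstring's bullet points describe NLTK's TreebankWordTokenizer, which self.word_tok presumably wraps here. A short hedged sketch of that underlying tokenizer (the output shown is what current NLTK releases produce; verify locally):

from nltk.tokenize import TreebankWordTokenizer

word_tok = TreebankWordTokenizer()
# Contractions are split and most punctuation becomes its own token.
print(word_tok.tokenize("They'll say it don't matter."))
# expected roughly: ['They', "'ll", 'say', 'it', 'do', "n't", 'matter', '.']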
Example #13
def noun_phrases():
    text = get_text(request)
    noun_phrases = set(TextBlob(text).noun_phrases)
    # Strip punctuation from ends of noun phrases and exclude long phrases
    stripped = [strip_punc(np) for np in noun_phrases if len(np.split()) <= 5]
    return jsonify({"result": stripped})
Example #14
import nltk
nltk.download('stopwords', download_dir='.')

from nltk.corpus import stopwords
nltk.data.path.append('.')
stop_words = stopwords.words('english')

from textblob.utils import strip_punc
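# 'sc' below is assumed to be the SparkContext preconfigured by the PySpark
# notebook environment (e.g. a Jupyter kernel on HDInsight); 'wasb:///' paths
# resolve to the cluster's Azure Blob storage container.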
tokenized = sc.textFile('wasb:///example/data/RomeoAndJuliet.txt')\
              .map(lambda line: strip_punc(line, all=True).lower())\
              .flatMap(lambda line: line.split())

filtered = tokenized.filter(lambda word: word not in stop_words)

from operator import add
word_counts = filtered.map(lambda word: (word, 1)).reduceByKey(add)

filtered_counts = word_counts.filter(lambda item: item[1] >= 60)

from operator import itemgetter
sorted_items = sorted(filtered_counts.collect(), 
                      key=itemgetter(1), reverse=True)

max_len = max([len(word) for word, count in sorted_items])
for word, count in sorted_items:
    print('{:>{width}}: {}'.format(word, count, width=max_len))
Example #15
 def test_strip_punc_all(self):
     assert_equal(strip_punc(self.text, all=True),
                 'this Has Punctuation')
Example #16
 def test_strip_punc(self):
     assert_equal(strip_punc(self.text),
                 'this. Has. Punctuation')
Example #17
 def test_strip_punc(self):
     assert_equal(strip_punc(self.text), 'this. Has. Punctuation')
Example #18
 def test_strip_punc_all(self):
     assert_equal(strip_punc(self.text, all=True), 'this Has Punctuation')
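Taken together, these tests pin down strip_punc's two modes: by default only punctuation at the ends of the string is removed, while all=True removes every punctuation character. A standalone sketch with a hypothetical fixture value consistent with both expected results (the real self.text in the test class may differ):

from textblob.utils import strip_punc

text = 'this. Has. Punctuation?! '   # hypothetical stand-in for self.text

print(strip_punc(text))            # 'this. Has. Punctuation'  (ends only)
print(strip_punc(text, all=True))  # 'this Has Punctuation'    (every mark)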