def initialize(self):
    self.tagger = Blobber(pos_tagger=PatternTagger())
    parts_of_speech = ['DT', 'NN', 'VBZ', 'TO', 'VB', 'CD', 'POS', 'JJ',
                       'CC', 'IN', 'PRP', 'VBG', 'RB', 'JJR', 'NNS', 'MD']
    # Map each known POS tag to an integer index, offset by index_offset;
    # any tag outside this list shares a single "rare" index.
    self.pos_to_idx = {}
    for i, pos in enumerate(parts_of_speech):
        self.pos_to_idx[pos] = i + self.index_offset
    self.rare_idx = max(self.pos_to_idx.values()) + 1
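# A minimal standalone sketch (not from the original source) of the same
# tag-to-index scheme, with index_offset assumed to be 1:
index_offset = 1
parts_of_speech = ['DT', 'NN', 'VBZ']
pos_to_idx = {pos: i + index_offset for i, pos in enumerate(parts_of_speech)}
rare_idx = max(pos_to_idx.values()) + 1
encoded = [pos_to_idx.get(tag, rare_idx) for tag in ['DT', 'NN', 'XX']]
print(encoded)  # [1, 2, 4] -- the unknown tag 'XX' falls back to the rare index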
def test_passing_bad_init_params(self):
    tagger = PatternTagger()
    # A tagger instance is not a valid value for any of these parameters
    assert_raises(ValueError, lambda: tb.TextBlob("blah", parser=tagger))
    assert_raises(ValueError, lambda: tb.TextBlob("blah", np_extractor=tagger))
    assert_raises(ValueError, lambda: tb.TextBlob("blah", tokenizer=tagger))
    assert_raises(ValueError, lambda: tb.TextBlob("blah", analyzer=tagger))
    # An analyzer is not a valid pos_tagger either
    analyzer = PatternAnalyzer
    assert_raises(ValueError, lambda: tb.TextBlob("blah", pos_tagger=analyzer))
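# For contrast, a minimal sketch (not part of the test suite) passing each
# component to the parameter it is actually meant for:
import textblob as tb
from textblob.taggers import PatternTagger
from textblob.sentiments import PatternAnalyzer

blob = tb.TextBlob("blah", pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())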
def __init__(self, path):
    self.Documents = []
    # Characters kept when cleaning document text
    self.allowed = set(
        [chr(i) for i in xrange(ord('a'), ord('z') + 1)] +
        [chr(i) for i in xrange(ord('A'), ord('Z') + 1)] +
        # [',', '-', ' '] + [str(i) for i in xrange(10)])
        [',', '.', '?', '-', '!', ' '] + [str(i) for i in xrange(10)])
    self.punctuation = [';', ':', '&', '?', "/"]
    self.P = Partition(self.punctuation)
    self.tagger = PatternTagger()
    self.sw = StopWords()
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                self.Documents.append(line)
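# A hypothetical cleaning step (not in the original source) showing how an
# `allowed` set like the one above can filter a document down to the
# permitted characters:
allowed = set('abc ,.')
print(''.join(ch for ch in 'a+b=c, ok?' if ch in allowed))  # 'abc, '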
class ConllExtractor(BaseNPExtractor):
    '''A noun phrase extractor that uses chunk parsing
    trained with the CoNLL-2000 training corpus.
    '''

    POS_TAGGER = PatternTagger()

    # The context-free grammar with which to filter the noun phrases
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    # POS suffixes that will be ignored
    INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP']

    def __init__(self, parser=None):
        self.parser = ChunkParser() if not parser else parser

    def extract(self, text):
        '''Return a list of noun phrases (strings) for a body of text.'''
        sentences = nltk.tokenize.sent_tokenize(text)
        noun_phrases = []
        for sentence in sentences:
            parsed = self._parse_sentence(sentence)
            # Get the string representation of each subtree that is a
            # noun phrase tree
            phrases = [_normalize_tags(filter_insignificant(each,
                           self.INSIGNIFICANT_SUFFIXES))
                       for each in parsed
                       if isinstance(each, nltk.tree.Tree)
                       and each.label() == 'NP'
                       and len(filter_insignificant(each)) >= 1
                       and _is_match(each, cfg=self.CFG)]
            nps = [tree2str(phrase) for phrase in phrases]
            noun_phrases.extend(nps)
        return noun_phrases

    def _parse_sentence(self, sentence):
        '''Tag and parse a sentence (a plain, untagged string).'''
        tagged = self.POS_TAGGER.tag(sentence)
        return self.parser.parse(tagged)
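# A minimal usage sketch, assuming textblob is installed and the NLTK
# 'punkt' and 'conll2000' corpora have been downloaded:
from textblob import TextBlob
from textblob.np_extractors import ConllExtractor

blob = TextBlob("Python is a high-level programming language.",
                np_extractor=ConllExtractor())
print(blob.noun_phrases)  # e.g. WordList(['python', 'high-level programming language'])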
def __init__(self, path):
    data_home = os.path.split(path)[0]
    self.Documents = []
    # Characters kept when cleaning document text
    self.allowed = set(
        [chr(i) for i in xrange(ord('a'), ord('z') + 1)] +
        [chr(i) for i in xrange(ord('A'), ord('Z') + 1)] +
        # [',', '-', ' '] + [str(i) for i in xrange(10)])
        [',', '.', '?', '-', '!', ' '] + [str(i) for i in xrange(10)])
    punctuation = [';', ':', '&', '?', "/"]
    # P = Partition(punctuation)
    self.tagger = PatternTagger()
    with open(path, 'r') as f:
        for line in f:
            # Each input line is tab-separated; the document text is
            # in the second field.
            li = line.split("\t")[1].strip()
            if li:
                self.Documents.append(li)
    data_Inter_path = os.path.join(data_home, "Intermediate")
    self.inter = data_Inter_path
    self.P = Partition(punctuation, data_Inter_path, data_home)
    self.sw = StopWords(data_home)
def iws(self, text):
    # Prefix patterns for the Penn tag families (re.match anchors at the
    # start, so 'VB*' matches 'VB', 'VBZ', 'VBG', etc.)
    verbs = re.compile('VB*')
    nouns = re.compile('NN*')
    adjectives = re.compile('JJ*')
    adverbs = re.compile('RB*')
    blob = TextBlob(text, pos_tagger=PatternTagger())
    tags = blob.pos_tags
    for index, t in enumerate(tags):
        if t[1] == 'UH':
            # Interjection immediately followed by an adjective or adverb
            if (index < len(tags) - 1) and (
                    adjectives.match(tags[index + 1][1]) or
                    adverbs.match(tags[index + 1][1])):
                return 1
            # Otherwise, look for an intensifying pair (RB+JJ, JJ+NN, or
            # RB+VB) anywhere after the interjection
            for i, next_tag in enumerate(tags[index + 1:]):
                if (i < len(tags[index + 1:]) - 1) and (
                        (adverbs.match(next_tag[1]) and
                         adjectives.match(tags[index + 1 + i + 1][1])) or
                        (adjectives.match(next_tag[1]) and
                         nouns.match(tags[index + 1 + i + 1][1])) or
                        (adverbs.match(next_tag[1]) and
                         verbs.match(tags[index + 1 + i + 1][1]))):
                    return 1
            return 0
    return None
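# A quick illustration (not from the original source) of the tag sequences
# iws() inspects; the exact tags depend on PatternTagger's output:
from textblob import TextBlob
from textblob.taggers import PatternTagger

blob = TextBlob("Wow, really great movie!", pos_tagger=PatternTagger())
print(blob.pos_tags)  # e.g. [('Wow', 'UH'), ('really', 'RB'), ('great', 'JJ'), ('movie', 'NN')]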
def __init__(self):
    self.tag_stack = []
    self.ignore_data = False
    self.parsed_text = ''
    self.blobber = Blobber(parser=PatternParser(), pos_tagger=PatternTagger())
    HTMLParser.__init__(self)
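# Hypothetical usage (the enclosing HTMLParser subclass name is assumed):
#
#   parser = BlobHTMLParser()
#   parser.feed("<p>Some <b>bold</b> text.</p>")
#   print(parser.parsed_text)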
# -*- coding: utf-8 -*-
import re

from nltk import RegexpParser
from textblob import TextBlob, Word
from textblob.taggers import PatternTagger
from textblob.sentiments import NaiveBayesAnalyzer

from maicroft.words.utility_text_sets import stopwords
from maicroft.words import utility_text_sets

pattern_tagger = PatternTagger()
naive_bayes_analyzer = NaiveBayesAnalyzer()

NOUN = "n"
VERB = "v"
ADV = "r"
ADJ = "a"


class TextParser:
    """
    Utility class for processing text content.
    """

    substitutions = utility_text_sets.substitutions

    # Skip if any of these is the *only* attribute - for instance,
    # "I'm a big fan of Queen" makes sense, but "I'm a fan" doesn't.
    skip_lone_attributes = [
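# A quick sketch (not in the original module): the single-letter constants
# above are the WordNet POS codes accepted by textblob's Word.lemmatize(),
# e.g.:
#   >>> Word("running").lemmatize(VERB)
#   'run'
#   >>> Word("better").lemmatize(ADJ)
#   'good'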
class Blobber(object):
    '''A factory for TextBlobs that all share the same tagger,
    tokenizer, parser, classifier, and np_extractor.

    Usage:

        >>> from textblob import Blobber
        >>> from textblob.taggers import NLTKTagger
        >>> from textblob.tokenizers import SentenceTokenizer
        >>> tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer())
        >>> blob1 = tb("This is one blob.")
        >>> blob2 = tb("This blob has the same tagger and tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionadded:: 0.4.0
    '''

    np_extractor = FastNPExtractor()
    pos_tagger = PatternTagger()
    tokenizer = WordTokenizer()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None,
                 analyzer=None, parser=None, classifier=None):
        _initialize_models(self, tokenizer, pos_tagger, np_extractor,
                           analyzer, parser, classifier)

    def __call__(self, text):
        '''Return a new TextBlob object with this Blobber's ``np_extractor``,
        ``pos_tagger``, ``tokenizer``, ``analyzer``, and ``classifier``.

        :returns: A new TextBlob.
        '''
        return TextBlob(text, tokenizer=self.tokenizer,
                        pos_tagger=self.pos_tagger,
                        np_extractor=self.np_extractor,
                        analyzer=self.analyzer,
                        parser=self.parser,
                        classifier=self.classifier)

    def __repr__(self):
        classifier_name = self.classifier.__class__.__name__ + "()" \
            if self.classifier else "None"
        return ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")\
                .format(self.tokenizer.__class__.__name__,
                        self.pos_tagger.__class__.__name__,
                        self.np_extractor.__class__.__name__,
                        self.analyzer.__class__.__name__,
                        self.parser.__class__.__name__,
                        classifier_name)

    __str__ = __repr__
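# A short sketch (not in the original docstring) of the ``classifier``
# parameter, using textblob's NaiveBayesClassifier:
from textblob import Blobber
from textblob.classifiers import NaiveBayesClassifier

train = [("I love this library.", "pos"), ("I hate bugs.", "neg")]
tb = Blobber(classifier=NaiveBayesClassifier(train))
print(tb("I love it!").classify())  # e.g. 'pos'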
class BaseBlob(StringlikeMixin, BlobComparableMixin):
    '''An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`PatternTagger <textblob.en.taggers.PatternTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.
    '''

    np_extractor = FastNPExtractor()
    pos_tagger = PatternTagger()
    tokenizer = WordTokenizer()
    translator = Translator()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, text, tokenizer=None, pos_tagger=None, np_extractor=None,
                 analyzer=None, parser=None, classifier=None, clean_html=False):
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, tokenizer, pos_tagger, np_extractor,
                           analyzer, parser, classifier)

    @cached_property
    def words(self):
        '''Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.
        '''
        return WordList(word_tokenize(self.raw, include_punc=False))

    @cached_property
    def tokens(self):
        '''Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        '''
        return WordList(self.tokenizer.tokenize(self.raw))

    def tokenize(self, tokenizer=None):
        '''Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults to
            this blob's default tokenizer.
        '''
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))

    def parse(self, parser=None):
        '''Parse the text.

        :param parser: (optional) A parser instance. If ``None``, defaults to
            this blob's default parser.

        .. versionadded:: 0.6.0
        '''
        p = parser if parser is not None else self.parser
        return p.parse(self.raw)

    def classify(self):
        '''Classify the blob using the blob's ``classifier``.'''
        if self.classifier is None:
            raise NameError("This blob has no classifier. Train one first!")
        return self.classifier.classify(self.raw)

    @cached_property
    def sentiment(self):
        '''Return a tuple of form (polarity, subjectivity) where polarity
        is a float within the range [-1.0, 1.0] and subjectivity is a float
        within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is
        very subjective.
        :rtype: tuple
        '''
        return self.analyzer.analyze(self.raw)

    @cached_property
    def polarity(self):
        '''Return the polarity score as a float within the range [-1.0, 1.0].

        :rtype: float
        '''
        return PatternAnalyzer().analyze(self.raw)[0]

    @cached_property
    def subjectivity(self):
        '''Return the subjectivity score as a float within the range
        [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float
        '''
        return PatternAnalyzer().analyze(self.raw)[1]

    @cached_property
    def noun_phrases(self):
        '''Returns a list of noun phrases for this blob.'''
        return WordList([phrase.strip().lower()
                         for phrase in self.np_extractor.extract(self.raw)
                         if len(phrase) > 1])

    @cached_property
    def pos_tags(self):
        '''Returns a list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
            ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        '''
        return [(Word(word, pos_tag=t), unicode(t))
                for word, t in self.pos_tagger.tag(self.raw)
                if not PUNCTUATION_REGEX.match(unicode(t))]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        '''Dictionary of word frequencies in this text.'''
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        '''Dictionary of noun phrase frequencies in this text.'''
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts

    def ngrams(self, n=3):
        '''Return a list of n-grams (tuples of n successive words) for this
        blob.
        '''
        if n <= 0:
            return []
        grams = [WordList(self.words[i:i + n])
                 for i in range(len(self.words) - n + 1)]
        return grams

    def translate(self, from_lang=None, to="en"):
        '''Translate the blob to another language.
        Uses the Google Translate API. Returns a new TextBlob.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("Simple is better than complex")
            >>> b.translate(to="es")
            TextBlob('Lo simple es mejor que complejo')

        Language code reference:
        https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0

        :param from_lang: Language to translate from. If ``None``, will attempt
            to detect the language.
        :param to: Language to translate to.
        :rtype: BaseBlob
        '''
        if from_lang is None:
            from_lang = self.translator.detect(self.string)
        return self.__class__(self.translator.translate(self.raw,
                              from_lang=from_lang, to_lang=to))

    def detect_language(self):
        '''Detect the blob's language using the Google Translate API.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("bonjour")
            >>> b.detect_language()
            u'fr'

        Language code reference:
        https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0

        :rtype: str
        '''
        return self.translator.detect(self.raw)

    def correct(self):
        '''Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: BaseBlob
        '''
        # regex matches: contraction or word or punctuation or whitespace
        tokens = nltk.tokenize.regexp_tokenize(self.raw,
                                               "\w*('\w*)+|\w+|[^\w\s]|\s")
        corrected = (Word(w).correct() for w in tokens)
        ret = ''.join(corrected)
        return self.__class__(ret)

    def _cmpkey(self):
        '''Key used by ComparableMixin to implement all rich comparison
        operators.
        '''
        return self.raw

    def _strkey(self):
        '''Key used by StringlikeMixin to implement string methods.'''
        return self.raw

    def __hash__(self):
        return hash(self._cmpkey())

    def __add__(self, other):
        '''Concatenates two text objects the same way Python strings are
        concatenated.
        Arguments:
        - `other`: a string or a text object
        '''
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError('Operands must be either strings or {0} objects'
                            .format(self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a WordList."""
        return WordList(self._strkey().split(sep, maxsplit))
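# A brief sketch of the string-like behavior defined above:
from textblob import TextBlob

blob = TextBlob("Simple is better than complex.")
print(blob.split())               # a WordList rather than a plain list
print((blob + " Flat is better than nested.").raw)
print(blob.ngrams(n=2)[0])        # e.g. WordList(['Simple', 'is'])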