def save(self, *args, **kwargs):
    from goose import Goose
    from text.blob import TextBlob

    g = Goose()
    article = g.extract(url=self.url)
    try:
        b = TextBlob(article.title)
        lang = b.detect_language()
    except Exception:
        lang = 'en'
    # Extract again with the detected language so the cleaned text is language-aware.
    g = Goose({'use_meta_language': False,
               'target_language': lang,
               'paper_class': 'soup'})
    article = g.extract(url=self.url)
    if not self.title:
        self.title = article.title
    if not self.newspaper:
        self.newspaper = article.domain
    if not self.content:
        self.content = article.cleaned_text
    try:
        if article.top_image.src:
            layout = Photo()
            # layout.photo = "images/news/" + str(self.id) + ".jpg"
            layout.url = article.top_image.src
            layout.article = self
            layout.save()
    except Exception:
        pass
    super(Article, self).save(*args, **kwargs)
def save(self):
    import re

    if not self.content:
        # Concatenate the content of all related articles.
        content = ''
        for a in self.article.all():
            content += a.content + '\r'
        # Normalize typographic punctuation and strip digits
        # (str.replace() does not take a regex, so use re.sub for the digits).
        content = content.replace(u'–', '-')
        content = content.replace(u'“', '"')
        content = content.replace(u'”', '"')
        content = content.replace(u'’', "'")
        content = re.sub(r'\d', '', content)
        self.content = content
    if not self.summary:
        self.summary = get_summary(self.content, self.compression)
        # Detect the language from the first line, then prefer an OTS summary.
        from text.blob import TextBlob
        try:
            b = TextBlob(self.content.split('\n', 1)[0])
            lang = b.detect_language()
        except Exception:
            lang = 'en'
        o = ots.OTS(lang, self.compression)
        filename = u'text' + str(self.id) + '.txt'
        f = open(filename, 'w')
        f.write(self.content.encode("utf-8"))
        f.close()
        o.parse(filename, 60)
        try:
            os.remove(filename)
        except OSError:
            pass
        self.summary = str(o)
    if not self.spin_summary:
        self.spin_summary = get_text_synonymizer(self.summary)
    super(Syntesis, self).save()
def test_tag_blob(self):
    blob = TextBlob(self.text, pos_tagger=self.tagger)
    tags = blob.tags
    logging.debug("tags: {0}".format(tags))
    words = self.text.split()
    for i, word_tag in enumerate(tags):
        assert_equal(word_tag[0], words[i])
def sentences_sentiment():
    text = get_text(request)
    blob = TextBlob(text)
    sentences = [{"sentence": unicode(s), "sentiment": s.sentiment[0]}
                 for s in blob.sentences]
    return jsonify({"result": sentences})
def sentence_to_words(sentence):
    """Convert the passed sentence into a list of words.

    Returns all words except English stop words.
    """
    # Build the stop word set once, and compare lowercased words so that
    # capitalized stop words ("The", "A", ...) are filtered out too.
    stop = set(stopwords.words('english'))
    blob = TextBlob(sentence)
    return [word.lower() for word in blob.words if word.lower() not in stop]
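# Usage sketch for sentence_to_words() above (assumes the NLTK stopwords
# corpus has been downloaded, e.g. nltk.download('stopwords')):
print(sentence_to_words("The quick brown fox jumps over the lazy dog"))
# -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']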
def cache_sentences(self):
    self.cached = True
    self.cache_list = []
    for key, value in self.__dict__.items():
        if key not in filterTag:
            try:
                blob = TextBlob(value)
                for sentence in blob.sentences:
                    self.cache_list.append(sentence)
            except Exception as e:
                logger.debug("textblob error| %s:%s" % (key, value))
def create_blob(self, request_json):
    options = {}
    if request_json.get('analyzer') == 'NaiveBayesAnalyzer':
        options['analyzer'] = self.naive_bayes_analyzer
    if request_json.get('np_extractor') == 'ConllExtractor':
        options['np_extractor'] = self.conll_extractor
    if request_json.get('pos_tagger') == 'NLTKTagger':
        options['pos_tagger'] = self.nltk_tagger
    elif request_json.get('pos_tagger') == 'PerceptronTagger':
        options['pos_tagger'] = self.perceptron_tagger
    return TextBlob(request_json['text'], **options)
def one_sentence_from(self, quote):
    """Reduce the given quote to a single sentence.

    The choice is biased against the first sentence, which is less
    likely to be the start of a real in-text sentence.
    """
    blob = TextBlob(quote)
    try:
        sentences = blob.sentences
    except Exception as e:
        # TextBlob can't parse this. Just return the whole string.
        return quote
    # (Completion sketch; the original snippet is truncated here.)
    # Bias against the first sentence, per the docstring.
    if len(sentences) < 2:
        return quote
    return unicode(random.choice(sentences[1:]))
def truncate_at_stopword(self, string):
    # Truncate a string at the last stopword not preceded by
    # another stopword.
    # print "%s =>" % string
    if type(string) == Sentence:
        words = string.words
    else:
        try:
            words = TextBlob(string).words
        except Exception as e:
            # TextBlob can't parse this. Just return the whole string.
            return string
    # (Completion sketch; the original snippet is truncated here, and
    # `self.stopwords` is assumed to be the instance's stop word list.)
    for i in range(len(words) - 1, 0, -1):
        if (words[i].lower() in self.stopwords
                and words[i - 1].lower() not in self.stopwords):
            return " ".join(words[:i])
    return string
def to_sentences(self):
    if self.cached:
        for sentence in self.cache_list:
            yield sentence
    else:
        for key, value in self.__dict__.items():
            if key not in filterTag:
                try:
                    blob = TextBlob(value)
                    for sentence in blob.sentences:
                        yield sentence
                except Exception as e:
                    logger.debug("textblob error| %s:%s" % (key, value))
def __init__(self):
    # Create custom components.
    self.naive_bayes_analyzer = NaiveBayesAnalyzer()
    self.conll_extractor = ConllExtractor()
    self.nltk_tagger = NLTKTagger()
    self.perceptron_tagger = PerceptronTagger()
    if DEV_ENV:
        return
    # Warm up all components (default and custom) by evaluating each lazy
    # property once, so the first real request doesn't pay the startup cost.
    text = 'TextBlob blobs great!'
    default_blob = TextBlob(text)
    default_blob.sentiment
    default_blob.noun_phrases
    default_blob.pos_tags
    custom_blob = TextBlob(text,
                           analyzer=self.naive_bayes_analyzer,
                           np_extractor=self.conll_extractor,
                           pos_tagger=self.nltk_tagger)
    custom_blob.sentiment
    custom_blob.noun_phrases
    custom_blob.pos_tags
    custom2_blob = TextBlob(text, pos_tagger=self.perceptron_tagger)
    custom2_blob.pos_tags
@classmethod
def rate(cls, s, base_score=1.0, frequencies=None, obscurity_cutoff=None):
    "Rate a string's suitability as an _ebook quote."
    s = s.strip()
    score = float(base_score)
    # print s
    # print " Starting rating: %.2f" % score

    # People like very short or very long quotes.
    # if len(s) < 40:
    #     score *= 2
    if len(s) > 128:
        score *= 2
    # print " Length bonus: %.2f" % score

    blob = TextBlob(s.decode("utf8"))
    try:
        words = blob.words
    except Exception as e:
        # TODO: I'm sick of trying to get TextBlob to parse
        # strings that include things like ". . . ". Just return
        # the current score.
        return score
    # (The original snippet is truncated here; presumably `words` was then
    # scored against `frequencies` and `obscurity_cutoff`. Returning the
    # score so far keeps the method runnable.)
    return score
def quotes_in(self, paragraph):
    para = textwrap.wrap(paragraph, self.wrap_at)
    if len(para) == 0:
        return
    probability = self.probability
    if para[0][0].upper() == para[0][0]:
        # We greatly prefer lines that start with capital letters.
        probability *= 5
    else:
        probability /= 4
    gathering = False
    in_progress = None
    last_yield = None
    for i in range(len(para)):
        line = para[i]
        if gathering:
            # We are currently putting together a quote.
            done = False
            if (random.random() < self.truncate_chance
                    and len(in_progress) >= self.minimum_quote_size):
                # Yield a truncated quote.
                done = True
            else:
                potential = in_progress + ' ' + line.strip()
                if len(potential) >= self.maximum_quote_size:
                    # That would be too long. We're done.
                    done = True
                else:
                    in_progress = potential
            if done:
                quote = in_progress
                in_progress = None
                gathering = done = False
                # Miscellaneous tweaks to increase the chance that
                # the quote will be funny.
                if random.random() < 0.6:
                    quote = self.one_sentence_from(quote)
                if random.random() < 0.4:
                    quote = self.truncate_at_stopword(quote)
                # Quotes that end with two consecutive stopwords
                # are not funny. It would be best to parse every
                # single quote and make sure it doesn't end with
                # two consecutive stopwords. But in practice it's
                # much faster to just check for the biggest
                # offenders, which all end in 'the', and then trim
                # the 'the'.
                low = quote.lower()
                for ending in ('of the', 'in the', 'and the',
                               'on the', 'for the'):
                    if low.endswith(ending):
                        quote = quote[:-len(" the")]
                        break
                quote = unicode(quote)
                quote = self.remove_ending_punctuation(quote)
                quote = self.remove_beginning_punctuation(quote)
                if random.random() > 0.75:
                    quote = self.truncate_to_common_word(quote)
                if (len(quote) >= self.minimum_quote_size
                        and len(quote) <= self.maximum_quote_size
                        and self.ONE_LETTER.search(quote)):
                    yield quote
                    last_yield = quote
                continue
        else:
            # We are not currently gathering a quote. Should we be?
            if random.random() < probability:
                # Run the regular expression and see if it matches.
                m = self.SEVERAL_CAPITALIZED_WORDS.search(line)
                if m is not None:
                    phrase = m.groups()[0]
                    if "Gutenberg" in phrase or "Proofreader" in phrase:
                        # Part of the meta, not part of the text.
                        continue
                    # Tag the text to see if it's a proper noun.
                    blob = TextBlob(phrase)
                    tags = blob.tags
                    proper_nouns = [x for x, tag in tags
                                    if tag.startswith('NNP')]
                    if len(proper_nouns) < len(tags) / 3.0:
                        # We're good.
                        yield phrase
                        continue
            matches = self._line_matches(line)
            if matches or random.random() < probability:
                gathering = True
                if matches:
                    # A keyword match! Start gathering a quote either
                    # at this line or some earlier line.
                    maximum_backtrack = (
                        self.maximum_quote_size / self.wrap_at) - 1
                    backtrack = random.randint(0, maximum_backtrack)
                    start_at = max(0, i - backtrack)
                    in_progress = " ".join(
                        [x.strip() for x in para[start_at:i + 1]])
                else:
                    in_progress = line.strip()
def test_upper(self):
    blob = TextBlob('lorem ipsum')
    assert_true(is_blob(blob.upper()))
    assert_equal(blob.upper(), TextBlob('LOREM IPSUM'))
def test_upper_and_words(self):
    blob = TextBlob('beautiful is better')
    assert_equal(blob.upper().words,
                 WordList(['BEAUTIFUL', 'IS', 'BETTER']))
def test_index(self):
    blob = TextBlob(self.text)
    assert_equal(blob.index('Namespaces'), self.text.index('Namespaces'))
def test_strip_and_words(self):
    blob = TextBlob('Beautiful is better! ')
    assert_equal(blob.strip().words,
                 WordList(['Beautiful', 'is', 'better']))
def test_lower(self):
    blob = TextBlob('Lorem Ipsum')
    assert_true(is_blob(blob.lower()))
    assert_equal(blob.lower(), TextBlob('lorem ipsum'))
def text(self, tweetObject):
    analysis = TextBlob(tweetObject)
    print("HELLO")
    print(analysis.sentiment)
def test_split(self):
    blob = TextBlob('Beautiful is better')
    assert_equal(blob.split(), WordList(['Beautiful', 'is', 'better']))
def test_endswith(self):
    blob = TextBlob(self.text)
    assert_true(blob.endswith('of those!'))
    assert_true(blob.ends_with('of those!'))
def test_startswith(self):
    blob = TextBlob(self.text)
    assert_true(blob.startswith('Beautiful'))
    assert_true(blob.starts_with('Beautiful'))
def test_rfind(self):
    text = 'Beautiful is better than ugly. '
    blob = TextBlob(text)
    assert_equal(blob.rfind('better'), text.rfind('better'))
def test_find(self):
    text = 'Beautiful is better than ugly.'
    blob = TextBlob(text)
    assert_equal(blob.find('better', 5, len(blob)),
                 text.find('better', 5, len(text)))
def classify(new_comment, bayes):
    """Take a comment string (to be classified) and a trained bayes
    classifier; return 'pos' (normal) or 'neg' (crazy)."""
    analyze = TextBlob(new_comment, classifier=bayes)
    return analyze.classify()
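# A minimal usage sketch for classify() above: train a small
# NaiveBayesClassifier (a real textblob API) on illustrative data that is
# not from the original project, then classify a new comment.
from textblob.classifiers import NaiveBayesClassifier

bayes = NaiveBayesClassifier([
    ('This makes perfect sense, thanks.', 'pos'),
    ('Great explanation, very helpful!', 'pos'),
    ('The lizard people control the weather!', 'neg'),
    ('They read my thoughts through the router.', 'neg'),
])
print(classify('I appreciate the detailed answer.', bayes))  # likely 'pos'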
def content_to_sentences(text):
    """Convert the passed text into a list of sentences."""
    blob = TextBlob(text)
    return [str(sentence) for sentence in blob.sentences]
def noun_phrases():
    text = get_text(request)
    noun_phrases = set(TextBlob(text).noun_phrases)
    # Strip punctuation from ends of noun phrases and exclude long phrases.
    stripped = [strip_punc(np) for np in noun_phrases if len(np.split()) <= 5]
    return jsonify({"result": stripped})
def test_blob_analyze(self):
    pos_blob = TextBlob(self.pos, analyzer=self.analyzer)
    assert_true(pos_blob.sentiment[0] > 0.0)
    neg_blob = TextBlob(self.neg, analyzer=self.analyzer)
    assert_true(neg_blob.sentiment[0] < 0.0)
# writer.writerow(columns)
global_row = 0  # xlsxwriter has no writerow() function, so we have to keep track of what row we're on
columns = ['unit', 'id_article', 'position (+1)', 'unit_content',
           'adjectives', 'verbs', 'article_title',
           'article_content_no_tags', 'article_url']

workbook = xlsxwriter.Workbook(publication_prefix + 'full_report.xlsx')  # Create new spreadsheet
worksheet = workbook.add_worksheet()  # Make new worksheet

for col in range(0, len(columns)):
    worksheet.write(global_row, col, columns[col])
global_row += 1

for row in ngo_mentions:  # Loop through all rows in the database results
    # Use TextBlob to parse the article.
    # blob.tags returns the following parts of speech (some are missing, like VBN, etc.):
    # noun (NN), adjective (JJ), determiner (DT), verb (VB), noun phrase (NP),
    # sentence subject (SBJ), and prepositional noun phrase (PNP)
    blob = TextBlob(row['article_content_no_tags'])

    # Split the article into paragraphs.
    paragraphs = re.split('(\n)+', row['article_content_no_tags'])
    paragraphs = [paragraph for paragraph in paragraphs if paragraph != "\n"]
    paragraphs_lower = [paragraph.lower() for paragraph in paragraphs]

    # Add line numbers.
    # enumerate(list, 1) yields (1, item1), (2, item2), etc.
    article_numbered = ['(' + str(paragraph[0]) + ') ' + paragraph[1]
                        for paragraph in enumerate(paragraphs, 1)]
    csv_article = '\n'.join(article_numbered)

    # Get a list of all the paragraphs that mention one of the organizations.
    paragraph_position = [i for i, x in enumerate(paragraphs_lower)
                          if any(org.lower() in x for org in organizations)]

    # Split the article into sentences
def test_title(self):
    blob = TextBlob('Beautiful is better')
    assert_equal(blob.title(), TextBlob('Beautiful Is Better'))
import copy
import math

import nltk
from nltk import bigrams, trigrams
from textblob import TextBlob

# StopwordsTokenizer is this project's custom tokenizer, defined elsewhere.


class WordGrapher(object):
    doc = ""
    blob = None
    docs = []
    bigrams = None
    trigrams = None
    tokens = None
    tokenizer = None
    tfidf = None

    stopwords = []
    try:
        stopwords.extend(nltk.corpus.stopwords.words('indonesian'))
        stopwords.extend(nltk.corpus.stopwords.words('english'))
    except IOError:
        pass

    def __init__(self, doc=None, docs=None):
        self.tokenizer = StopwordsTokenizer(stopwords=self.stopwords)
        if doc:
            self.set_document(doc=doc)
        if docs:
            self.set_documents(docs=docs)

    def set_document(self, doc, docs_list_mode=False):
        if doc:
            return self.initialize_document(doc=doc,
                                            docs_list_mode=docs_list_mode)
        else:
            raise ValueError("Document must not be None or empty")

    def set_documents(self, docs):
        if docs and isinstance(docs, list) and len(docs) > 0:
            self.docs = [self.set_document(doc=doc, docs_list_mode=True)
                         for doc in docs]
        else:
            raise ValueError("Documents must not be None or an empty list")

    def initialize_document(self, doc, docs_list_mode=False):
        if not docs_list_mode:
            self.doc = doc.lower()
            self.blob = TextBlob(text=self.doc, tokenizer=self.tokenizer)
            self.tokens = copy.deepcopy(self.blob.tokens)
            self.bigrams = self.bigramify(self.blob.tokens)
            self.tokens.extend(self.bigrams)
            self.trigrams = self.trigramify(self.blob.tokens)
            self.tokens.extend(self.trigrams)
        else:
            doc = doc.lower()
            blob = TextBlob(text=doc, tokenizer=self.tokenizer)
            tokens = copy.deepcopy(blob.tokens)
            # Build both n-gram lists from the raw token list before
            # extending, mirroring the single-document branch above.
            bigram = self.bigramify(tokens=blob.tokens)
            trigram = self.trigramify(tokens=blob.tokens)
            tokens.extend(bigram)
            tokens.extend(trigram)
            return tokens

    def bigramify(self, tokens, as_string=True):
        if as_string:
            return ["%s %s" % (words[0], words[1])
                    for words in bigrams(tokens)]
        else:
            return bigrams(tokens)

    def trigramify(self, tokens, as_string=True):
        if as_string:
            return ["%s %s %s" % (words[0], words[1], words[2])
                    for words in trigrams(tokens)]
        else:
            return trigrams(tokens)

    def ngrams(self, n):
        return self.blob.ngrams(n=n)

    def freq(self, word, docs=None):
        if docs is None:
            return self.tokens.count(word)
        else:
            if not isinstance(docs, str):
                docs = " ".join(str(item) for item in docs)
            blob = TextBlob(text=docs, tokenizer=self.tokenizer)
            # Pass the token list, not the blob itself, to the n-gram
            # helpers, and build both n-gram lists from the raw tokens.
            bigram = self.bigramify(tokens=blob.tokens)
            trigram = self.trigramify(tokens=blob.tokens)
            blob.tokens.extend(bigram)
            blob.tokens.extend(trigram)
            return blob.tokens.count(word)

    def tf(self, word):
        return self.freq(word=word) / float(self.doc_word_count())

    def doc_word_count(self):
        return len(self.tokens)

    def num_docs_containing(self, word):
        if self.docs is None:
            return 2
        else:
            count = 0
            for document in self.docs:
                if self.freq(word=word, docs=document) > 0:
                    count += 1
            return 1 + count

    def idf(self, word):
        if self.docs is None:
            docs_length = 1
        else:
            docs_length = len(self.docs)
        num_docs = self.num_docs_containing(word)
        return math.log(docs_length / float(num_docs))

    def tf_idf(self, word):
        return self.tf(word) * self.idf(word)

    def analyze(self, count=10, percentage=False):
        if not self.doc or not self.docs:
            raise ValueError("Document and its Documents Set must not be None or empty")
        score = {'freq': {}, 'tf': {}, 'idf': {}, 'tf-idf': {}}
        for token in self.tokens:
            score['freq'][token] = self.freq(token)
            score['tf'][token] = self.tf(token)
            score['idf'][token] = self.idf(token)
            score['tf-idf'][token] = math.fabs(self.tf_idf(token))
        # Keep the highest tf-idf score seen for each token.
        final = {}
        for token in score['tf-idf']:
            if token not in final or score['tf-idf'][token] > final[token]:
                final[token] = score['tf-idf'][token]
        if not percentage:
            self.tfidf = sorted(final.items(), key=lambda x: x[1],
                                reverse=True)[:count]
            return self.tfidf
        else:
            result = sorted(final.items(), key=lambda x: x[1],
                            reverse=True)[:count]
            # Express each score as a percentage of the top score.
            top = 0.0
            for item in result:
                if item[1] > top:
                    top = item[1]
            self.tfidf = [(item[0], "%.2f%%" % (item[1] / top * 100))
                          for item in result]
            return self.tfidf

    def graph(self, word):
        return self.graph_doc(word=word)

    def graph_doc(self, word):
        if not self.tfidf:
            raise ValueError("Please call analyze first before creating a graph")
        result = {}
        tris = self.trigramify(tokens=self.blob.tokens, as_string=False)
        matches = ["%s %s %s" % (tri[0], tri[1], tri[2])
                   for tri in tris if word in tri[1]]
        result['tris'] = [item for item in self.tfidf if item[0] in matches]
        bis = self.bigramify(tokens=self.blob.tokens, as_string=False)
        matches = ["%s %s" % (bi[0], bi[1])
                   for bi in bis if word in bi[0] or word in bi[1]]
        result['twos'] = [item for item in self.tfidf if item[0] in matches]
        return result
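# A usage sketch for WordGrapher (the documents below are made up for
# illustration, and the project's StopwordsTokenizer is assumed to be
# importable). analyze() needs both a target document and a comparison
# corpus, as its guard enforces.
wg = WordGrapher()
wg.set_document("the quick brown fox jumps over the lazy dog")
wg.set_documents([
    "the dog sleeps all day",
    "a fox is a small wild canine",
    "brown bears eat fish and berries",
])
print(wg.analyze(count=5))                    # [(token, tf-idf score), ...]
print(wg.analyze(count=5, percentage=True))   # scores relative to the top hit
print(wg.graph(word="fox"))                   # bigram/trigram neighborhoods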
def sentiment():
    text = get_text(request)
    sentiment = TextBlob(text).sentiment[0]  # Polarity score
    return jsonify({"result": sentiment})
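# A minimal wiring sketch for endpoints like sentiment() above. `app` and
# get_text() are assumptions standing in for this project's real setup;
# the Flask and TextBlob calls themselves are real APIs.
from flask import Flask, request, jsonify
from textblob import TextBlob

app = Flask(__name__)

def get_text(req):
    # Hypothetical helper: pull raw text from a JSON body or a form field.
    data = req.get_json(silent=True) or {}
    return data.get('text') or req.form.get('text', '')

@app.route('/sentiment', methods=['POST'])
def sentiment_endpoint():
    return jsonify({"result": TextBlob(get_text(request)).sentiment[0]})

# e.g. curl -X POST -H 'Content-Type: application/json' \
#      -d '{"text": "TextBlob is great!"}' http://localhost:5000/sentiment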
def test_format(self):
    blob = TextBlob('1 + 1 = {0}')
    assert_equal(blob.format(1 + 1), TextBlob('1 + 1 = 2'))
    assert_equal('1 + 1 = {0}'.format(TextBlob('2')), '1 + 1 = 2')
def test_replace(self):
    blob = TextBlob('textblob is a blobby blob')
    assert_equal(blob.replace('blob', 'bro'),
                 TextBlob('textbro is a broby bro'))
    assert_equal(blob.replace('blob', 'bro', 1),
                 TextBlob('textbro is a blobby blob'))
def test_strip(self):
    text = 'Beautiful is better than ugly. '
    blob = TextBlob(text)
    assert_true(is_blob(blob))
    assert_equal(blob.strip(), TextBlob(text.strip()))
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

train = [  # (leading training examples truncated in the source snippet)
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg'),
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

cl = NaiveBayesClassifier(train)

# Classify some text
print(cl.classify("Their burgers are amazing."))  # "pos"
print(cl.classify("I don't like their pizza."))   # "neg"

# Classify a TextBlob
blob = TextBlob("The beer was amazing. But the hangover was horrible. "
                "My boss was not pleased.", classifier=cl)
print(blob)
print(blob.classify())
for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())

# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy(test)))

# Show 5 most informative features
cl.show_informative_features(5)
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 4 09:44:50 2013

@author: ozdemircili
"""

from text.blob import TextBlob

text = TextBlob("Once upon a time there was a program called Pycheat. "
                "It was one of the cheats")

text.tags
text.noun_phrases
text.sentiment
text.words
text.sentences
text.title

text.words[-1].singularize()
text.words[3].pluralize()

from text.blob import Word
from text.blob import Verb

# Lemmatization
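# A hedged sketch of what the "Lemmatization" note above likely intended.
# Word.lemmatize() is the real API (it needs the WordNet corpus downloaded):
print(Word("octopi").lemmatize())    # -> 'octopus'
print(Word("went").lemmatize("v"))   # -> 'go' (lemmatize as a verb)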