def translate(text):
    blob = TextBlob(text)
    transl = blob.translate(to='en')
    # jsondetail is a helper defined elsewhere that records the analysis details
    result = jsondetail(blob.tags, blob.noun_phrases, blob.word_counts, blob.words,
                        blob.tokenize(), blob.sentiment_assessments, transl)
    return transl
def analisis(text):
    blob = TextBlob(text)
    #lang = blob.detect_language()
    transl = ''
    polarity = 0
    #sentences = blob.sentences
    #if (lang != 'en'):
    transl = blob.translate(to='en')
    enBlob = TextBlob(str(transl))
    blob = enBlob
    sentences = enBlob.sentences
    for sentence in sentences:
        polarity += sentence.sentiment.polarity
    percent = round(polarity * 100)
    print(percent)
    # jsonify comes from Flask; posneg, neg, and is_hoax are helpers defined elsewhere
    result = jsonify({
        "polarity": percent,
        "positive": posneg(percent),
        "negative": neg(percent),
        "isHoax": is_hoax(percent),
        #"language": lang,
        "tags": blob.tags,
        "noun_phrases": blob.noun_phrases,
        "word_counts": blob.word_counts,
        "words": blob.words,
        "tokenize": blob.tokenize(),
        "sentiment_assessments": blob.sentiment_assessments,
        "translation": transl
    })
    return result
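# The helpers posneg, neg, and is_hoax above are defined elsewhere in that
# codebase. A minimal sketch of what they might look like, purely an
# assumption inferred from how they are used, not the original code:
def posneg(percent):
    """Positive share of the polarity percentage (assumed behavior)."""
    return percent if percent > 0 else 0

def neg(percent):
    """Negative share of the polarity percentage (assumed behavior)."""
    return abs(percent) if percent < 0 else 0

def is_hoax(percent):
    """Flag strongly negative text as a possible hoax (assumed threshold)."""
    return percent < 0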
def process_order(self, order: str, source: sr.Microphone):
    """
    Processes a converted voice command using nltk, TextBlob, and the vectorizer.
    The command sentence is tokenized and filtered in order to catch "hot"
    words and decide whether the sentence maps to any implemented feature.

    :param order: str
        Voice command converted to text
    :param source: speech_recognition.Microphone
        Object representing a physical microphone on the computer
    :return: None
    """
    sentence_to_analyze = TextBlob(order)
    self.check_sentence_polarity(sentence_to_analyze)
    order_vector = self.vectorizer.transform([order]).toarray()
    command_category = self.classifier.predict(order_vector)[0]
    if self.check_command_category(source, command_category):
        tokenized_order = sentence_to_analyze.tokenize()
        preprocess_order = [
            word for word in tokenized_order
            if word not in stopwords.words('english')
        ]
        self.commands[command_category](source, preprocess_order)
    else:
        self.convert_text_to_speech(BasicPhrases.NO_COMMEND)
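# process_order assumes self.vectorizer and self.classifier were fitted
# beforehand. A minimal sketch of how such a pair might be trained with
# scikit-learn; the phrases, categories, and model choice are placeholder
# assumptions, not the original training code:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

phrases = ["play some music", "what is the weather", "set an alarm"]  # hypothetical commands
categories = ["music", "weather", "alarm"]                            # hypothetical labels

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(phrases).toarray()

classifier = MultinomialNB()
classifier.fit(X, categories)  # predict() then returns one of the category labels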
def match_syntagm_text_blob_multi(syntagms, text):
    from textblob import TextBlob
    from textblob_fr import PatternTagger, PatternAnalyzer
    blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    return match_sequences(syntagms, list(blob.tokenize()))
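# match_sequences is not defined in this snippet. A minimal sketch of what it
# might do, assuming it returns the syntagms whose token sequence appears
# contiguously in the token list (an assumption, not the original helper):
def match_sequences(syntagms, tokens):
    matches = []
    for syntagm in syntagms:
        parts = syntagm.split()
        n = len(parts)
        # scan for a contiguous window of tokens equal to the syntagm's words
        for i in range(len(tokens) - n + 1):
            if list(tokens[i:i + n]) == parts:
                matches.append(syntagm)
                break
    return matches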
from textblob.taggers import NLTKTagger

nltkTagger = NLTKTagger()  # tagger instance used below (created outside the function in the original)

def process_keywords(input_file):
    keywords = []  # renamed from `list`, which shadowed the builtin
    for line in input_file:
        blob = TextBlob(line, pos_tagger=nltkTagger)
        kwds = blob.tokenize()
        if len(kwds) > 2:  # for longer lines, keep only the noun phrases
            kwds = blob.noun_phrases
        keywords.append(kwds)
    return keywords
def translate(text):
    blob = TextBlob(text)
    transl = blob.translate(to='en')
    return {
        "tags": blob.tags,
        "noun_phrases": blob.noun_phrases,
        "word_counts": blob.word_counts,
        "words": blob.words,
        "tokenize": blob.tokenize(),
        "sentiment_assessments": blob.sentiment_assessments,
        "translation": transl
    }
def __call__(self, raw_data: str):
    """Transform raw_data into new, normalized data."""
    # correct the words, then stem each lowercased token
    t = TextBlob(raw_data).correct()
    word_list = [self.st.stem(w.lower()) for w in t.tokenize()]
    s = " ".join(word_list)
    tb = TextBlob(s).correct()
    return str(tb)
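# The class around __call__ is not shown; self.st is presumably a stemmer.
# A minimal usage sketch, assuming an NLTK SnowballStemmer and a made-up
# wrapper class name (TextNormalizer is not from the original code):
from textblob import TextBlob
from nltk.stem import SnowballStemmer

class TextNormalizer:
    def __init__(self):
        self.st = SnowballStemmer("english")

    def __call__(self, raw_data: str):
        t = TextBlob(raw_data).correct()
        word_list = [self.st.stem(w.lower()) for w in t.tokenize()]
        return str(TextBlob(" ".join(word_list)).correct())

normalize = TextNormalizer()
print(normalize("The strippped leaves were fallling"))  # spell-corrected, stemmed text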
def translate(text):
    # note: translate() and detect_language() call the Google Translate API
    # and are deprecated in newer TextBlob releases
    blob = TextBlob(text)
    lang = blob.detect_language()
    transl = ''
    p = 0
    sa = []
    if lang != 'en':  # anything other than English is handled here
        transl = blob.translate(to='en')
        pol = polarity(str(transl))
        p = pol[0]
        sa = pol[1]
    else:
        p = polarity(text)[0]
        sa = blob.sentiment_assessments
    return [
        transl, lang, blob.tags, blob.noun_phrases, blob.word_counts,
        blob.words, blob.tokenize(), sa, text, p
    ]
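# The function above returns its results positionally, which is easy to
# misread. A usage sketch showing the order of the returned fields (assumes
# the polarity helper used by translate() is defined elsewhere):
result = translate("Ceci est une phrase.")
(transl, lang, tags, noun_phrases, word_counts,
 words, tokens, assessments, original_text, polarity_score) = result
print(lang, polarity_score)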
    elif tag == "VB":
        entry = {'word': word, 'synonyms': []}
        for synset in word.get_synsets(pos='v'):  # 'v' for verb synsets (the original reused 'n')
            for syn in synset.lemmas():
                entry['synonyms'].append(syn.name().replace('_', ' '))
        verbs.append(entry)
    elif tag == "JJ":
        entry = {'word': word, 'synonyms': []}
        for synset in word.get_synsets(pos='a'):  # 'a' for adjective synsets (the original reused 'n')
            for syn in synset.lemmas():
                entry['synonyms'].append(syn.name().replace('_', ' '))
        adjectives.append(entry)

for token in processed_paragraph.tokenize():
    for entry in nouns:
        if token == entry['word']:
            if len(entry['synonyms']) != 0:
                synonym = random.choice(entry['synonyms'])
                paragraph = paragraph.replace(token, synonym)
    for entry in verbs:
        if token == entry['word']:
            if len(entry['synonyms']) != 0:
                synonym = random.choice(entry['synonyms'])
                paragraph = paragraph.replace(token, synonym)
from textblob import TextBlob

text = "What am i are you dooing here?"
blob = TextBlob(text)
ans = blob.tokenize()
print(ans)
'''
t = blob.correct()
print(t)
word = blob.words
print(word)
word = blob.word_counts
print(word)
for every in blob.sentiment_assessments:
    print(every)
'''
snt = blob.sentiment.polarity
print(snt)
def noun_phrases(text):
    blob = TextBlob(text)
    return blob.tokenize()  # note: despite the name, this returns tokens, not blob.noun_phrases
from textblob import TextBlob  # first, we need to import TextBlob, the package that will help us analyze text

source_text = "Don't tell me the moon is shining; show me the glint of light on broken glass."  # chekhov
processed_text = TextBlob(source_text)  # in order for us to process it, we pass it to TextBlob

nouns = []       # this is the list where we will store all our nouns
verbs = []       # this one is where we will store verbs
adjectives = []  # this is where we store adjectives

print("\n======================\n")

# TOKENIZING is the process by which you separate the sentence into individual
# tokens (essentially words, suffixes, and punctuation)
for word in processed_text.tokenize():
    print(word)

print("\n======================\n")

# PARTS OF SPEECH tagging lets you get the grammatical role of each word
for word, tag in processed_text.tags:
    print("word: %s || part-of-speech: %s" % (word, tag))
    if tag == "NN":    # here we are looking for nouns (NN)
        nouns.append(word)  # if we find one, we append it to our list
    elif tag == "VB":  # here we look for verbs (VB)
        verbs.append(word)
    elif tag == "JJ":  # and here for adjectives
        adjectives.append(word)

print("\n======================\n")

# SYNSETS are structures from WordNet through which you can get all the sets
# of related words for a given word
print(source)

custom_dictionary = []
for word, tag in processed.tags:
    if tag == 'NN':
        entry = {          # each of our entries in our dictionary
            'word': word,  # has the initial word
            'others': []   # as well as a list of other possibilities
        }
        for synset in word.get_synsets(pos="n"):
            for syn in synset.lemmas():  # loop through the lemmas related to the current noun
                # replace any '_' character with a ' ' space character when we
                # add it to our list of other possibilities
                entry['others'].append(syn.name().replace('_', ' '))
                if syn.antonyms():
                    entry['others'].append(syn.antonyms()[0].name().replace('_', ' '))
        custom_dictionary.append(entry)  # then we add the entry to our dictionary

# this is the part where we actually replace the source text
for token in processed.tokenize():       # tokenize to make sure we get each part of the sentences
    for entry in custom_dictionary:      # then, for each token, go through our custom dictionary
        if token == entry['word']:       # if we match the word
            if len(entry['others']) != 0:  # and if we actually do have a word to replace it with!
                other = random.choice(entry['others'])  # pick a random alternative
                source = source.replace(token, other)   # and replace it in the source text

print(source)
"""
Created on Mon Apr 9 17:25:19 2018

@author: miaoji
"""
import time

import nltk.tokenize as nt
from textblob import TextBlob

start_time = time.time()

in_file = open("/data/zhangbin/caozhaojun/true_procress_data/daodao_en.txt", 'r')
out_file = open("handle_daodao_en.txt", 'a+')
tokenizer = nt.TweetTokenizer()

line_id = 0
for line in in_file.readlines():
    line_id += 1
    if line_id % 1000 == 0:
        print(line_id)
    correct_line = TextBlob(line.lower().replace('...', ' ').strip())  # .correct()
    token_line = correct_line.tokenize(tokenizer)
    final_line = ' '.join([word for word in token_line])
    out_file.write(final_line + '\n')

in_file.close()
out_file.close()

end_time = time.time()
print(float(end_time - start_time))
# toNote: pluralize & singularize!
print(attack_blob.words.singularize())
print(attack_blob.words.pluralize())
print(attack_blob.word_counts['of'])
print(attack_blob.ngrams(n=2))
print(attack_blob.ngrams(n=4))

from textblob import Word
for word in attack_blob.words:
    print(Word(word).correct() == word)

#%% Example from https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/
av_blob = TextBlob("Analytics Vidhya is a great platform to learn data science. \n It helps community through blogs, hackathons, discussions,etc.")
print(av_blob.tokenize())
print(av_blob.sentences, av_blob.sentences[0])

for phrase in av_blob.noun_phrases:
    print(phrase)  # analytics vidhya; great platform; data science

# toNote: part-of-speech tagging
for words, tag in av_blob.tags:
    print(words, tag)

# INFLECTION is the process of word formation in which characters are added
# to the base form of a word to express grammatical meanings.
# word inflection and lemmatization
print(av_blob.sentences[1].words[1].singularize())  # helps -> help

# pluralize
w = Word('Platform')
print(w.pluralize())  # platforms
# Sentiment analyzer trained on movie reviews
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
blob = TextBlob("I love this library", analyzer=NaiveBayesAnalyzer())
blob.sentiment  # requires the NLTK movie_reviews corpus: nltk.download('movie_reviews')

# Tokenizer
from nltk.tokenize import TabTokenizer
tokenizer = TabTokenizer()
blob = TextBlob("This is\ta rather tabby\tblob.", tokenizer=tokenizer)
blob.tokens

# This is an alternative way: pass the tokenizer to tokenize() directly
from nltk.tokenize import BlanklineTokenizer
tokenizer = BlanklineTokenizer()
blob = TextBlob("A token\n\nof appreciation")
blob.tokenize(tokenizer)

# Noun phrase chunkers
from textblob.np_extractors import ConllExtractor
extractor = ConllExtractor()
blob = TextBlob("Python is a high-level programming language.", np_extractor=extractor)
blob.noun_phrases

# POS taggers
from textblob.taggers import NLTKTagger
nltk_tagger = NLTKTagger()
blob = TextBlob("Tag! You're It!", pos_tagger=nltk_tagger)
blob.pos_tags

# Parser
from textblob.parsers import PatternParser
blob = TextBlob("Parsing is fun.", parser=PatternParser())
blob.parse()