def clean(doc):
    lemma = WordNetLemmatizer()
    exclude = set(string.punctuation)
    stoplist = stopwords.words('english') + wordlist
    stop = set(stoplist)
    # Remove URLs, "than..." tokens and @mentions
    normalized = re.sub(r"http\S+", "", doc)
    normalized = re.sub(r"than\S+", "", normalized)
    normalized = re.sub(r"@\S+", "", normalized)
    # Remove punctuation and collapse double spaces
    normalized = re.sub(r'[^\w\s]', '', normalized).replace("  ", " ")
    # Standardize words (collapse runs of repeated letters to at most two)
    normalized = ''.join(''.join(s)[:2] for _, s in itertools.groupby(normalized))
    normalized = TextBlob(normalized)
    normalized = ' '.join(normalized.noun_phrases)
    stop_free = " ".join([i for i in normalized.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    normalized = normalized.lower().strip().replace("\n", " ").replace(".", " ").replace("-", ' ')
    # with open('C:\TuDiabetes_Code - Final\TechTypes_Text_New\Physical_Activity_Clean.txt', 'a') as the_file:
    #     the_file.write(normalized)
    print normalized
    print "********************************************************"
    return normalized
def extract(ngrams, dataset, doc_id):
    # extract keywords
    print 'Extracting keywords'
    for i, ngram in enumerate(ngrams):
        doc = doc_id[i]
        if field not in dataset[doc]:
            dataset[doc][field] = set()
        if doc > 0 and doc % 1000 == 0:
            print '\t', doc
        for kw in filter(lambda k: '_' in k, ngram):
            keyword = kw.replace('_', ' ')
            kw_tb = TextBlob(keyword)
            # filter out punctuation, etc (make sure that there are two non-punc words)
            if len(kw_tb.words) < 2:
                continue
            # add keywords which are all proper nouns
            distinct_tags = set(t[1] for t in kw_tb.tags)
            if distinct_tags - {'NNP', 'NNPS'} == set():
                dataset[doc][field].add(kw_tb.lower())
                continue
            # add noun phrases
            for np in kw_tb.lower().noun_phrases:
                dataset[doc][field].add(np)
    return kw_set_to_list(dataset)
def get_user_timeline_tweets(self, num_tweets):
    tweets = []
    for tweet in Cursor(self.twitter_client.user_timeline,
                        id="@realdonaldtrump").items(num_tweets):
        t = TextBlob(tweet._json['text'])
        print t.lower(), t.sentiment
        tweets.append(tweet)
    return tweets
def classify(review):
    # lower-case and spell-correct the text first: TextBlob.lower() and
    # correct() return new blobs, so their results must be captured before
    # the classifying blob is built
    review = str(TextBlob(review).lower().correct())
    blob = TextBlob(review, classifier=cl)
    for sentence in blob.sentences:
        if sentence.classify() == "neg" and len(str(sentence)) > 3:
            negative_sen.append(str(sentence))
        elif sentence.classify() == "pos" and len(str(sentence)) > 3:
            positive_sen.append(str(sentence))
class SexxiBot:
    """
    Main ChatBot class to take in user input and return an appropriate response.

    Contains methods:
    fix_typos, to correct any user's typos;
    help_check, to check if the user has asked for 'help' (a list of possible commands);
    check_phrase_similarity, to compare user inputs to keywords to generate basic responses;
    create_response, to generate a new response based on the user's input.
    """

    def __init__(self):
        self.user_input = str()
        self.input_len = int()
        self.response = str()

    def fix_typos(self):
        self.user_input = TextBlob(self.user_input.lower()).tags
        # Fix lazy user typos, or slang
        words = list()
        for i in self.user_input:
            words.append(i[0])
        for part in range(len(words)):
            if words[part] in slang_typo_dict.keys():
                words[part] = slang_typo_dict[words[part]]
        self.user_input = ' '.join(words)
        return False  # Returns False to move on to help_check

    def help_check(self):
        if self.user_input.lower() == "help":
            self.response = responses.HELP
            return True
        return False  # User didn't ask for help, move on to check_phrase_similarity

    def check_phrase_similarity(self):
        self.user_input = TextBlob(self.user_input.lower()).tags
        self.input_len = len(self.user_input)
        for phrase_type in PHRASE_TYPES:
            for phrase in getattr(keywords, phrase_type):
                score = float()
                for word in self.user_input:
                    for n in phrase:
                        if word and n not in unimportant_words:
                            score += liquidmetal.score(n, word[0]) / self.input_len
                if score >= 0.7:  # Could be increased/decreased through testing to find a more optimal value
                    self.response = random.choice(getattr(responses, phrase_type))
                    return True
        return False

    def create_response(self):  # NOT WORKING YET!
        # Craft a response based on the user's message
        noun, pronoun, verb, adj, prep, text_len = check_pos_tags.pos_tags(self.user_input)
        self.response = format_response.craft_response(noun, pronoun, verb, adj, prep, text_len)
        print self.response
        return False if self.response == ' ' else True
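# A hedged usage sketch, not from the original source: one way to drive the
# SexxiBot pipeline above in a simple REPL. It assumes the same module-level
# helpers the class relies on (slang_typo_dict, keywords, responses,
# format_response); the 'exit' command is illustrative only.
if __name__ == '__main__':
    bot = SexxiBot()
    while True:
        bot.user_input = raw_input('> ')  # Python 2, matching the print statements above
        if bot.user_input.lower() == 'exit':
            break
        # try each stage in turn until one of them produces a response
        if (bot.fix_typos() or bot.help_check() or
                bot.check_phrase_similarity() or bot.create_response()):
            print bot.response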
def polarity_and_lang(message):
    # TextBlob has a limit on API calls
    try:
        if len(message) > 2:
            blob = TextBlob(message)
            leng = blob.detect_language()
            text = ''
            if leng == 'es':
                blob = blob.translate(to='en').lower()
                text = message
            else:
                blob = blob.lower()
                text = blob.translate(to='es').lower().raw
            pol = blob.sentiment[0]
        else:
            print('polarity_and_lang was given a text shorter than 3 characters')
            pol = 0
            text = message
    except Exception as e:
        print('Exception in polarity_and_lang: {0}'.format(e))
        pol = 0
        text = None
    return (pol, text)
def simplebot(user):
    """Rule-based bot; takes user input in the form of a string.

    Pre-processes the string in sequence: lower-cases, tokenizes and removes
    stop words. Iterates through CONVERSATION; if filtered_input intersects a
    conversation list, response_set is updated. If the set is empty a fallback
    message is returned, otherwise the longest string in the set is returned."""
    user_input = user
    user_blob = TextBlob(user_input)
    lower_input = user_blob.lower()
    token_input = lower_input.words
    filtered_input = [w for w in token_input if w not in STOP_WORDS]
    response_set = set()
    for con_list in CONVERSATION:
        for sentence in con_list:
            sentence_split = sentence.split()
            if set(filtered_input).intersection(sentence_split):
                response_set.update(con_list)
    if not response_set:
        return "I am sorry, I don't have an answer, ask again"
    else:
        return max(response_set, key=len)
def preprocess_sentence(sentence, output_type='TextBlob'):
    """
    Lowercase, filter out stopwords and stem a single sentence.

    Input: string or TextBlob object
    """
    if type(sentence) == str:
        sentence = TextBlob(sentence)
    elif type(sentence) != textblob.blob.TextBlob:
        raise ValueError('Input is neither a string nor a TextBlob object')
    sentence = sentence.lower()
    word_list = sentence.words
    filtered_words = [
        word.stem() for word in word_list
        if word not in stopwords.words('english')
    ]
    if output_type == 'TextBlob':
        return TextBlob(' '.join(filtered_words))
    elif output_type == 'list':
        return filtered_words
    elif output_type == 'string':
        return ' '.join(filtered_words)
    else:
        raise ValueError('Output type not understood')
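# A small usage sketch for preprocess_sentence() above (assumes the NLTK stopword
# corpus has been downloaded and the imports used by the function are in place).
example = "The striped bats were hanging on their feet"
print(preprocess_sentence(example))                       # TextBlob of stemmed, stopword-free words
print(preprocess_sentence(example, output_type='list'))   # the same stems as a plain list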
def keywords(reviews, numKeywords):
    """Gets the [numKeywords] most frequent words within the reviews for a
    product/service, excluding stop_words.

    Args:
        reviews: a list of all the reviews for a product/service
        numKeywords: the number of keywords we want to have

    Returns:
        dict; maps each of the [numKeywords] keywords to its scaled frequency
    """
    alltext = ""
    for review in reviews:
        alltext = alltext + review
    blob = TextBlob(alltext)
    words = blob.lower().words
    num_words = len(words)
    # count the frequency of each word
    counts = Counter(word for word in words if word not in stop_words)
    keywords = {a: b for a, b in counts.most_common(numKeywords)}
    # scale each count by the total number of words
    for i in keywords:
        keywords[i] = keywords[i] / float(num_words)
    return keywords
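# Hedged illustration of keywords() above; the review strings and the module-level
# stop_words set the function depends on are assumptions for this example.
sample_reviews = [
    "Great battery life and a great screen. ",
    "The battery died after a week and the screen cracked. ",
]
print(keywords(sample_reviews, 3))  # e.g. scaled frequencies for 'battery', 'screen', 'great'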
def do_in(self, sentence):
    # converts words to numbers (like two -> 2)
    sentence = convert_w2n(sentence)
    # TextBlob is used for preprocessing the input
    parsed = TextBlob(sentence)
    parsed = parsed.lower()
    # correct the spelling if needed
    parsed = parsed.correct()
    # check the main category menu, subcategory menu, and quantity
    resp = check_for_main_category(parsed)
    if resp == None:
        resp = check_for_menu(parsed)
    if resp == None:
        resp = check_for_greeting(parsed)
    if resp == None:
        check_for_quantity(parsed)
    # prints the output
    if resp != None:
        print(resp)
    elif (len(Chatbot.sub_cat_item) != 0) and (len(Chatbot.quantity_main) != 0):
        print "Total cost of your order is", find_cost()
    else:
        print(construct_response(parsed))
def preProcessingNeg(l):
    blob = TextBlob(l[0])
    blob = blob.lower()
    words = blob.words  # Tokenizing
    newOpinion = ''
    for word in words:
        if word in good:
            newOpinion += 'good' + ' '
        elif word in bad:
            newOpinion += 'bad' + ' '
        elif word not in junk:
            # Stemming
            if word[-1] == 's':
                word = word[0:-1]
            if word[-3:] == 'ing':
                word = word[0:-3]
            if word[-2:] == 'ly':
                word = word[0:-2]
            if word == "n't":
                word = 'not'
            if word[-2:] == 'ed':
                word = word[0:-2]
            newOpinion += word + ' '
    threeWords = mostCommon(newOpinion)
    for t in threeWords:
        if t[0] in totalDictNeg:
            totalDictNeg[t[0]] += t[1]
        else:
            totalDictNeg[t[0]] = t[1]
    return newOpinion
def augment(self, data):
    """
    A method to paraphrase a sentence.

    :type data: str
    :param data: sentence used for data augmentation

    :rtype: str
    :return: the augmented data
    """
    if type(data) is not str:
        raise TypeError("DataType must be a string")
    data = TextBlob(data.lower())
    try:
        data = data.translate(from_lang=self.src, to=self.to)
        data = data.translate(from_lang=self.to, to=self.src)
    except NotTranslated:
        try:
            # Switch to googletrans to do the translation.
            translator = Translator()
            data = translator.translate(str(data), dest=self.to, src=self.src).text
            data = translator.translate(data, dest=self.src, src=self.to).text
        except Exception:
            print("Error: not translated.\n")
            raise
    return str(data).lower()
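# Hedged usage sketch: the class that owns augment() is not shown above, so
# BackTranslationAugmenter is a hypothetical stand-in exposing the src/to language
# codes the method expects. Both translation backends need network access, so the
# exact paraphrase returned will vary.
augmenter = BackTranslationAugmenter(src='en', to='fr')
print(augmenter.augment("The quick brown fox jumps over the lazy dog"))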
def passage2word(path=trainpath, type='train'):
    listofword = []
    filelist = []
    taglist = []
    traverse(path, filelist, taglist)
    if type == 'train':
        # id_tag = open(r'E:\id_tag.txt', 'wb')
        with open(os.path.join(nowpath, 'id_tag.txt'), 'wb') as id_tag:
            pickle.dump(taglist, id_tag)
    elif type == 'test':
        global taglist_test
        taglist_test = taglist
    global passageNum
    passageNum = len(filelist)
    for file in filelist:
        f = open(file, 'r', errors='ignore')
        passage = f.read()
        f.close()
        blob = TextBlob(passage.strip())  # renamed from `str` to avoid shadowing the built-in
        wordlist = blob.lower().words
        k = len(wordlist)
        answer = []
        if k > 0:
            for x in range(k):
                answer.append(wordlist[x].lemmatize())
                answer[x] = Word(answer[x]).lemmatize("v")
            filtered_answer = [x for x in answer if x not in stop_words]
            # print(filtered_answer)
            listofword.append(filtered_answer)
    return listofword
def update(table, field):
    conn = boto.dynamodb.connect_to_region('us-west-2',
                                           aws_access_key_id='',
                                           aws_secret_access_key='')
    table = conn.get_table(table)
    for line in table.scan():
        newline = line[field]
        text = TextBlob(newline)
        text = text.lower()
        textwords = text.split()
        wordcount = 0
        wordlist = []
        for word in textwords:
            wordcount += 1
            if word not in wordlist:
                wordlist.append(word)
        # handles div0 errors
        if wordcount == 0:
            lexdiv = 0
        else:
            lexdiv = round((len(wordlist) * 1.0) / wordcount, 2)
        polarity = text.sentiment.polarity
        subjectivity = text.sentiment.subjectivity
        line.put_attribute('subjectivity', subjectivity)
        line.put_attribute('polarity', polarity)
        line.put_attribute('lexical diversity', lexdiv)
        line.save()
def get_tweet_sentiment(self, tweet):
    # Create a TextBlob object for each tweet
    analysis = TextBlob(self.clean_tweet(tweet))
    # Initialise the sentiment counters and the tweet sentence
    pos = neg = neu = 0
    sentence = analysis.lower()
    words = sentence.split(' ')
    for word in words:
        # Classify the sentiment of each word of the tweet
        classResult = classifier.classify(word_feats(word))
        if classResult == 'pos':
            pos += 1
        elif classResult == 'neg':
            neg += 1
        elif classResult == 'neu':
            neu += 1
        # print(word, " ", classResult)
    # print(pos, " ", neg, " ", neu)
    # Return the sentiment classification result
    if pos > neg and pos >= neu:
        return 1
    elif neg > pos and neg >= neu:
        return -1
    else:
        return 0
def find_keyword(sentences: list, keyword: str) -> list:
    '''keyword should be in lower case'''
    data = []
    for sentence in sentences:
        tb = TextBlob(sentence)
        if keyword in tb.lower():
            data.append(sentence)
    return data
def get_ngram_counts(text, size):
    blob = TextBlob(text)
    # Extract n-grams as WordLists, then convert to a list of strings
    ngrams = [' '.join(ngram).lower() for ngram in blob.lower().ngrams(size)]
    # Convert to a dataframe, then count values and rename columns
    ngram_counts = pd.DataFrame(ngrams)[0].value_counts().rename_axis(
        'ngram').reset_index(name='count')
    return ngram_counts
def get_word_counts(text):
    blob = TextBlob(text)
    words = [[word, count] for word, count in blob.lower().word_counts.items()
             if word not in stopwords]
    word_counts = pd.DataFrame(words).rename({
        0: 'word',
        1: 'count'
    }, axis=1).sort_values(by='count', ascending=False)
    return word_counts
def on_status(self, status):
    text = status.text
    print(text)
    temp = TextBlob(text)
    temp = temp.lower()
    for word in temp.words:
        if word in collection:
            collection[word] += 1
        else:
            collection[word] = 1
def fit(self, X, y=None):
    words = []
    for x in X:
        x = TextBlob(x.lower())
        words += [word.lemmatize() for word in x.words]
    if self.num_words:
        words = Counter(words)
        self._vocab = [word for word, _ in words.most_common(self.num_words)]
    else:
        self._vocab = list(set(words))
    return self
def analyse_titles(titles_file='all_titles.txt'):
    with open(os.path.join('csv', titles_file), 'rb') as text_file:
        text = text_file.read().decode('ascii', errors="replace")
    blob = TextBlob(text)
    word_counts = [[word, count]
                   for word, count in blob.lower().word_counts.items()
                   if word not in STOPWORDS and count > 1]
    bigrams = [
        ' '.join(bigram).lower() for bigram in blob.lower().ngrams(2)
        if stopwords_check(bigram)
    ]
    bigram_counts = [[word, count] for word, count in Counter(bigrams).items()
                     if count > 1]
    trigrams = [
        ' '.join(trigram).lower() for trigram in blob.lower().ngrams(3)
        if stopwords_check(trigram)
    ]
    trigram_counts = [[word, count] for word, count in Counter(trigrams).items()
                      if count > 1]
    word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:20]
    bigram_counts = sorted(bigram_counts, key=itemgetter(1), reverse=True)[:20]
    trigram_counts = sorted(trigram_counts, key=itemgetter(1), reverse=True)[:20]
    np_counts = [[word, count]
                 for word, count in blob.lower().np_counts.items()
                 if count > 1]
    np_counts = sorted(np_counts, key=itemgetter(1), reverse=True)[:20]
    print '## Most frequent words\n'
    for term in word_counts:
        print '* {} ({})'.format(term[0], term[1])
    print '\n\n## Most frequent bigrams\n'
    for term in bigram_counts:
        print '* {} ({})'.format(term[0], term[1])
    print '\n\n## Most frequent trigrams\n'
    for term in trigram_counts:
        print '* {} ({})'.format(term[0], term[1])
    print '\n\n## Most frequent noun phrases\n'
    for term in np_counts:
        print '* {} ({})'.format(term[0], term[1])
def transform(self, X):
    vectors = []
    for x in X:
        x = TextBlob(x.lower())
        word_count = Counter(x.words)
        vector = [0] * len(self._vocab)
        for word, count in word_count.items():
            try:
                idx = self._vocab.index(word)
                vector[idx] = count
            except ValueError:
                pass
        vectors.append(vector)
    return vectors
def similarWords(faqWord, inputWords, printToWindow, useSynsets=True):
    # This method identifies whether two words are similar
    # Returns True or False
    similarPath = False
    matchMethod = ''
    tbFaqWord = TextBlob(faqWord)
    lowerFAQWord = tbFaqWord.lower()
    lowerInputWord = inputWords[1]
    correctedInputWord = inputWords[2]
    if useSynsets:
        FAQWordSynset = lowerFAQWord.words[0].synsets
    wordsAreSimilar = False
    if lowerFAQWord == lowerInputWord:
        wordsAreSimilar = True
        matchMethod = 'identical'
    elif lowerFAQWord == correctedInputWord:
        wordsAreSimilar = True
        matchMethod = 'corrected'
    elif lowerFAQWord.words.singularize() == correctedInputWord.words.singularize():
        wordsAreSimilar = True
        matchMethod = 'singularized'
    elif useSynsets:
        if len(FAQWordSynset) > 0:
            # correctInputWordSynsets = correctedInputWord.words[0].synsets
            correctInputWordSynsets = inputWords[3]
            if len(correctInputWordSynsets) > 0:
                correctedWordSimilarity = FAQWordSynset[0].wup_similarity(
                    correctInputWordSynsets[0])
                if correctedWordSimilarity is not None:
                    if correctedWordSimilarity >= 0.65:
                        wordsAreSimilar = True
                        similarPath = True
                        matchMethod = 'similarity'
    if printToWindow and wordsAreSimilar:
        print("Words: " + str(lowerFAQWord) + " & " + str(lowerInputWord) +
              " (" + str(matchMethod) + ")")
        if similarPath:
            print("Synset path similarity: " + str(correctedWordSimilarity))
    return wordsAreSimilar, matchMethod
def _preprocess(text):
    '''
    Preprocess the text
    1. Converting numbers and variables to a dummy word "hey_num"
    2. Converting all the letters to lowercase
    3. Correcting any spelling mistakes
    '''
    processed_text = HeyAC._word_to_digit(text)
    processed_text, list_var = HeyAC._digit_to_dummy(processed_text)
    processed_text = TextBlob(processed_text)
    processed_text = processed_text.lower()
    #processed_text = processed_text.correct()
    processed_text = str(processed_text)
    return processed_text, list_var
def simplebot(user_input):
    user_blob = TextBlob(user_input)
    lower_input = user_blob.lower()
    token_input = lower_input.words
    filtered_input = [w for w in token_input if w not in STOP_WORDS]
    response_set = set()
    for con_list in CONVERSATION:
        for sentence in con_list:
            sentence_split = sentence.split()
            if set(filtered_input).intersection(sentence_split):
                response_set.update(con_list)
    if not response_set:
        return "I'm sorry, ask again please!"
    else:
        return max(response_set, key=len)
def similarWords(faq_word, input_words, print_to_window, use_synsets=True):
    # This method identifies whether two words are similar
    # Returns True or False
    similar_path = False
    match_method = ''
    tb_faq_word = TextBlob(faq_word)
    lower_faq_word = tb_faq_word.lower()
    lower_input_word = input_words[1]
    corrected_input_word = input_words[2]
    if use_synsets:
        faq_word_synset = lower_faq_word.words[0].synsets
    words_are_similar = False
    if lower_faq_word == lower_input_word:
        words_are_similar = True
        match_method = 'identical'
    elif lower_faq_word == corrected_input_word:
        words_are_similar = True
        match_method = 'corrected'
    elif lower_faq_word.words.singularize() == corrected_input_word.words.singularize():
        words_are_similar = True
        match_method = 'singularized'
    elif use_synsets:
        if len(faq_word_synset) > 0:
            # correct_input_word_synsets = corrected_input_word.words[0].synsets
            correct_input_word_synsets = input_words[3]
            if len(correct_input_word_synsets) > 0:
                corrected_word_similarity = faq_word_synset[0].wup_similarity(
                    correct_input_word_synsets[0])
                if corrected_word_similarity is not None:
                    if corrected_word_similarity >= 0.65:
                        words_are_similar = True
                        similar_path = True
                        match_method = 'similarity'
    if print_to_window and words_are_similar:
        print("Words: " + str(lower_faq_word) + " & " + str(lower_input_word) +
              " (" + str(match_method) + ")")
        if similar_path:
            print("Synset path similarity: " + str(corrected_word_similarity))
    return words_are_similar, match_method
def createInputWordTuple(inputSentence, useSynsets=True):
    # This method creates a tuple for each word in the inputSentence
    correctedInputWordSynsets = ''
    inputMessageWords = []
    # input words: <Word, lowerWord, correctedWord, wordSynset>
    for inputWord in inputSentence.words:
        tbInputWord = TextBlob(inputWord)
        lowerInputWord = tbInputWord.lower()
        correctedInputWord = lowerInputWord.correct()
        if useSynsets:
            correctedInputWordSynsets = correctedInputWord.words[0].synsets
        inputMessageWords.append(
            (inputWord, lowerInputWord, correctedInputWord,
             correctedInputWordSynsets))
    return inputMessageWords
def textblob_ngrams(sentence, n=3, remove_stopwords=False, all_lower_case=False):
    '''
    Takes in a sentence and returns the n-grams of that sentence as features.

    @Arguments:
        sentence -- the sentence to process, type(sentence) = String

        n (optional) -- size of each n-gram, default 3

        remove_stopwords (optional) -- if True, stopwords in the sentence
            are not included as features. Currently only for English text.
            Initially False.

        all_lower_case (optional) -- if True, the sentence is lower-cased
            before n-grams are extracted. Initially False.

    @Return:
        Dict of features of the following form:
        {ngram_1: True, ngram_2: True, ... , ngram_n: True}
    '''
    sentence = TextBlob(sentence)
    features = dict()
    clean_string = ''

    # Changes all word features to lower case if true
    if all_lower_case:
        sentence = sentence.lower()

    # Removes stopwords
    for word in sentence.words:
        # Skips word if in nltk.corpus.stopwords('english')
        if remove_stopwords:
            if word.string in stopwords:
                continue
        clean_string += ''.join([word, ' '])

    # Extract n-grams from the cleaned string; join each n-gram into a single
    # string so it can serve as a dict key
    for ngram in TextBlob(clean_string).ngrams(n):
        features[' '.join(ngram)] = True
    return features
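# Illustrative call to textblob_ngrams() above (assumes the module-level stopwords
# collection the function reads is defined).
feats = textblob_ngrams("The food was great but the service was slow",
                        n=2, all_lower_case=True)
# feats maps each space-joined bigram to True, e.g. {'the food': True, 'food was': True, ...}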
def findmostusedword(excluded):
    wordlist = []
    nouns = ['NN', 'NNS', 'NNP', 'NNPS']
    prev = [0, 0]
    for headline in news:
        for word in headline.split():
            word = word.lower().replace('--', '')
            wordlist.append(word)
    for word in wordlist:
        if word not in nouse:
            word = TextBlob(word)
            if word.tags == []:
                continue
            if word.tags[0][1] in nouns:
                if wordlist.count(word.lower()) > prev[1] and word.tags[0][0] != "-":
                    prev = [word.tags[0][0], wordlist.count(word)]
    return prev
def analyze(liste_domaine, liste_tweet):
    # takes as input a list of tweet texts and a list containing the
    # lexical field of a domain, IN LOWER CASE

    ### collect the words from the tweets that belong to the matching lexical field
    mot_a_trouver = liste_domaine
    tweet_a_analyser = liste_tweet
    L = []  # words of the domain found in the tweets are appended to the list L
    for tweet in liste_tweet:
        current_tweet = TextBlob(tweet)
        current_tweet = current_tweet.lower()
        words = current_tweet.words
        for word in words:
            if word in mot_a_trouver:
                L.append(word)
    return L
def bias(text_file):
    bias = 0
    tokenized = tokenize(text_file)
    red_flags = ["alien", 'evil', 'monster', 'good', 'aliens']
    for w in range(len(tokenized)):
        word = TextBlob(tokenized[w])
        if word.sentiment.polarity < -.5:
            bias = bias + .5
        if word.sentiment.polarity > .5:
            bias = bias + .5
        if word.sentiment.polarity > -.5 and word.sentiment.polarity < 0:
            bias = bias + .25
        if word.sentiment.polarity < .5 and word.sentiment.polarity > 0:
            bias = bias + .25
        if word.sentiment.polarity > .5:
            bias = bias + 1
        if word.sentiment.polarity < -.5:
            bias = bias + 1
        if word.lower() in red_flags:
            bias = bias + 2
            print(word)
    return (bias / len(tokenized)) * 100
def hello_monkey():
    """Respond to incoming calls with a simple text message."""
    resp = twiml.Response()
    message = ""
    name = ""
    fromNumber = request.values.get('From', None)
    myNumber = request.values.get('To', None)
    body = request.values.get('Body')
    body = body.decode("ascii", errors="ignore")
    blob = TextBlob(body)
    NLPObject = NLPStuff(resp, blob, message)
    counter = storeCookies(blob)
    message += salutationToCaller(message, fromNumber, myNumber, counter)
    if "help" in blob.lower():
        message = "This is an information HELP message please tell me what to do"
    return setMessage(message, name, myNumber, counter, body, blob, resp)
def feature_extractor(text):
    if not isinstance(text, TextBlob):
        text = TextBlob(text.lower())
    return {
        'has_rumor': 'rumor' in text.words,
        'has_gosip': 'gosip' in text.words,
        'has_urbanesia': 'urbanesia' in text.words,
        'has_batista': 'batista' in text.words,
        'has_harahap': 'harahap' in text.words,
        'has_pemasaran': 'pemasaran' in text.words,
        'has_saham': 'saham' in text.words,
        'has_hackathon': 'hackathon' in text.words,
        'has_ipo': 'ipo' in text.words,
        'has_akuisisi': 'akuisisi' in text.words,
        'has_startup': 'startup' in text.words,
        'has_android': 'android' in text.words,
        'has_aplikasi': 'aplikasi' in text.words,
        'has_payment': 'payment' in text.words,
        'has_pembayaran': 'pembayaran' in text.words,
        'has_api': 'api' in text.words,
        'has_kompetisi': 'kompetisi' in text.words,
        'has_ide': 'ide' in text.words,
        'has_permainan': 'permainan' in text.words,
        'has_game': 'game' in text.words,
        'has_fundraising': 'fundraising' in text.words,
        'has_askds': '[Ask@DailySocial]' in text.words,
        'has_investasi': 'investasi' in text.words,
        'has_musik': 'musik' in text.words,
        'has_lagu': 'lagu' in text.words,
        'has_bhinneka': 'bhinneka' in text.words,
        'has_marketplace': 'marketplace' in text.words,
        'has_mobile': 'mobile' in text.words,
        'has_cto': 'cto' in text.words,
        'has_traffic': 'traffic' in text.words,
        'starts_with_[': text[0] == '['
    }
def tweet_analyser(self, ch, method, properties, body):
    # Extract the json from the tweet
    json_body = json.loads(body.decode('utf-8'))
    text = json_body['text']
    hashtags = json_body['hashtags']

    # Use TextBlob for sentiment analysis on the tweet and extract the sentiment
    text_blob = TextBlob(text)
    json_body['polarity'] = text_blob.sentiment.polarity

    # Try to extract the tweeted team from the names used for both teams
    tweeted_team = self.get_team_by_names(text)

    # If we managed to extract a team, save the tweet along with the name of the team
    if tweeted_team is not None:
        json_body['team'] = tweeted_team.name
        self.collection.insert_one(json_body)
    # If not, try to extract the tweeted team based on player names
    else:
        tweeted_team = self.get_team_by_players(text_blob.lower().words)
        if tweeted_team is not None:
            json_body['team'] = tweeted_team.name
            self.collection.insert_one(json_body)
        # Finally, fall back to the teams' hashtags
        else:
            if hashtags is not None:
                tweeted_team = self.get_team_by_hashtags([x['text'].lower() for x in hashtags])
                if tweeted_team is not None:
                    json_body['team'] = tweeted_team.name
                    self.collection.insert_one(json_body)
                else:
                    json_body['team'] = 'None'
                    self.collection.insert_one(json_body)
                    print("IGNORED : {}".format(text))
def getContent(tweet):
    txt = TextBlob(tweet['text'].split('https', 1)[0], np_extractor=extractor)
    txt = txt.lower()
    NP = txt.noun_phrases
    Subj = txt.pos_tags
    sentiment = txt.sentiment

    # Convert noun phrases from unicode to str before adding
    npToAdd = list()
    for np in NP:
        np = np.encode('ascii', 'ignore')
        npToAdd.append(np)

    # Filter words for greater importance (nouns, proper nouns, etc.)
    SubjToAdd = list()
    for word in Subj:
        # print word
        if word[1] in pos:  # Looking for nouns, or subject (i.e. movie, music, color)
            SubjToAdd.append(word)

    # Create a new Tweet object
    twt = Tweet()
    twt.addNew(tweet, npToAdd, SubjToAdd, sentiment)
    listOfTweets.append(twt)
def tweet_to_feat(tweet, features):
    tb = TextBlob(tweet)
    # lang = tb.detect_language()
    words = [word.lemma for word in tb.lower().tokenize()]
    return [words.count(feature) for feature in features]
from __future__ import print_function

from textblob import TextBlob
from nltk.stem.wordnet import WordNetLemmatizer
import sys

lmtzr = WordNetLemmatizer()

for line in sys.stdin.readlines():
    blob = TextBlob(line.strip())
    sys.stdout.write("Detected language: {}\n".format(blob.detect_language()))
    sys.stdout.write("This message had {} words.\n".format(len(blob.words)))
    sys.stdout.write("Corrected sentence\n{}\n".format(blob.lower().correct()))
    proper_nouns = [tag[0] for tag in blob.tags if tag[1] == 'NNP']
    verbs = [lmtzr.lemmatize(tag[0], 'v') for tag in blob.tags if 'V' in tag[1]]
    sys.stdout.write("I found these proper nouns: {}\n".format(proper_nouns))
    sys.stdout.write("I found these verbs: {}\n".format(verbs))
    sentiment = blob.sentiment
    sys.stdout.write("Sentiment for that message: {}\n".format(sentiment))
    if sentiment.polarity > 0 and sentiment.subjectivity > 0.7:
        sys.stdout.write("That sounds amazing!\n")
    elif sentiment.polarity < 0 and sentiment.subjectivity > 0.7:
        sys.stdout.write("It'll get better.\n")
    else:
        sys.stdout.write("Meh.\n")
    sys.stdout.flush()
if doc > 0 and doc % 1000 == 0:
    print '\t', doc
for kw in filter(lambda k: '_' in k, ngram):
    keyword = kw.replace('_', ' ')
    kw_tb = TextBlob(keyword)
    # filter out punctuation, etc (make sure that there are two non-punc words)
    if len(kw_tb.words) < 2:
        continue
    # add keywords which are all proper nouns
    distinct_tags = set(t[1] for t in kw_tb.tags)
    if distinct_tags - {'NNP', 'NNPS'} == set():
        dataset[doc][field].add(kw_tb.lower())
        continue
    # add noun phrases
    for np in kw_tb.lower().noun_phrases:
        dataset[doc][field].add(np)

# convert set into list for json serialization
for d in dataset:
    d[field] = list(d[field])
    # fix 's
    for i, np in enumerate(d[field]):
        if np.endswith(" 's"):
            d[field][i] = np[:-3]
# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])

# list the words
review.words

# list the sentences
review.sentences

# some string methods are available
review.lower()

# ## Part 6: Stemming and Lemmatization

# **Stemming:**
#
# - **What:** Reduce a word to its base/stem/root form
# - **Why:** Often makes sense to treat related words the same way
# - **Notes:**
#   - Uses a "simple" and fast rule-based approach
#   - Stemmed words are usually not shown to users (used for analysis/indexing)
#   - Some search engines treat words with the same stem as synonyms

# initialize stemmer
stemmer = SnowballStemmer('english')
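# A minimal follow-up sketch (assuming the `review` blob and `stemmer` above, with
# SnowballStemmer imported from nltk.stem): compare stemming and lemmatization on
# the review's words.
print([stemmer.stem(word) for word in review.words][:10])
print([word.lemmatize() for word in review.words][:10])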
now = datetime.datetime.now()

normSource = normmd.tables.get('Source')
sources = [dict(row) for row in normcon.execute(select([
        normSource.c.Id,
        normSource.c.Name,
        normSource.c.Content
    ]).where(
        normSource.c.Content.isnot(None)
    ))]

lemmafrequency = {}
for source in sources:
    if args.verbosity > 1:
        print "Reading source: " + source['Name']
    content = TextBlob(source['Content'])
    noun_phrases = content.lower().noun_phrases
    lemmas = noun_phrases.lemmatize()
    for lemma in lemmas:
        if lemma in lemmafrequency.keys():
            lemmafrequency[lemma] += 1
        else:
            lemmafrequency[lemma] = 1
    if args.limit > 0:
        args.limit -= 1
        if args.limit == 0:
            break

normNode = normmd.tables.get('Node')
normTagging = normmd.tables.get('Tagging')
nounPhraseNode = normcon.execute(select([
def find_tweet(tweet, place_list):
    t = TextBlob(unicode(tweet))
    tweet_loc = []
    for word in t.lower().tokenize():
        if word in place_list:
            tweet_loc = tweet_loc + [word]
    return tweet_loc