def _translate_message(bot, broadcast_list, context):
    """Append a translated copy of an outgoing message for rooms whose
    configured language differs from the origin room's language.

    broadcast_list: iterable of (conversation_id, response_segments)
        pairs; each response list is mutated in place when a translation
        is appended.
    context: optional dict; translation only happens when it carries an
        "autotranslate" entry with "conv_id" and "event_text" keys.
    """
    if context and "autotranslate" in context:
        _autotranslate = context["autotranslate"]
        origin_language = _get_room_language(bot, _autotranslate["conv_id"])
        for send in broadcast_list:
            target_conversation_id = send[0]
            response = send[1]
            target_language = _get_room_language(bot, target_conversation_id)
            if origin_language != target_language:
                logger.debug("translating {} to {}".format(origin_language, target_language))
                # Fall back to the untranslated text if the API call fails.
                translated = _autotranslate["event_text"]
                try:
                    en_blob = TextBlob(_autotranslate["event_text"])
                    translated = "{0}".format(en_blob.translate(to=target_language))
                    #translated = gs.translate(_autotranslate["event_text"], target_language
                except Exception:
                    logger.debug("Translation Api returned string unchanged")
                else:
                    pass
                finally:
                    if _autotranslate["event_text"] != translated:
                        # mutate the original response by reference
                        response.extend([
                            hangups.ChatMessageSegment('\n', hangups.SegmentType.LINE_BREAK),
                            hangups.ChatMessageSegment('(' + translated + ')')])
def update_book(book):
    """(Re)index *book* in the full-text search index.

    For English descriptions, keep only the nouns (NN/NNP tags) and, for
    nouns longer than two characters, also append their Korean
    translation (network call to Google Translate via TextBlob) so the
    document is searchable in both languages.  Non-English descriptions
    are indexed verbatim.
    """
    blob = TextBlob(book.description)
    if blob.detect_language() == 'en':
        description = ''
        # NN = common noun, NNP = proper noun
        nouns = filter(lambda x: x[1] == 'NN' or x[1] == 'NNP', blob.tags)
        for noun, tag in nouns:
            description += noun + " "
            if len(noun) > 2:
                description += TextBlob(noun).translate(to='ko').string + " "
    else:
        description = book.description
    book_document = search.Document(
        doc_id=book.ISBN,
        fields=[
            search.TextField(name='title', value=remove_punc(book.title)),
            search.TextField(name='author', value=remove_punc(book.author)),
            search.TextField(name='description', value=remove_punc(description))
        ]
    )
    index = get_book_index()
    index.put(book_document)
def tokenize(text, spell=False, stem=False, lemma=False, lower=False, stop=False):
    """Tokenize *text* (a UTF-8 byte string; Python 2) into a list of
    UTF-8-encoded alphabetic tokens, with optional spelling correction,
    lowercasing, lemmatization, stemming and stopword removal.

    NOTE(review): relies on module-level `stemmer` and `stopwords`.
    """
    # lowercase, remove non-alphas and punctuation
    b = TextBlob(unicode(text, 'utf8'))
    if spell:
        b = b.correct()
    words = b.words
    if lower:
        words = words.lower()
    if lemma:
        words = words.lemmatize()
    if stem:
        words = [stemmer.stem(w) for w in words]
    if stop:
        tokens = [w.encode('utf-8') for w in words if w.isalpha() and w not in stopwords]
    else:
        tokens = [w.encode('utf-8') for w in words if w.isalpha()]
    # letters_only = re.sub("[^a-zA-Z]", " ", text)
    # (earlier n-gram expansion experiment removed; see version history)
    return tokens
def tag_documents_text(client):
    """POS-parse every document in the cornell/documents collection with
    the perceptron tagger and store the result as 'parsed_perceptron'."""
    collection = client['cornell']['documents']
    for record in collection.find():
        tagged = TextBlob(record['text'], pos_tagger=PerceptronTagger()).parse()
        collection.update({'name': record['name']},
                          {'$set': {'parsed_perceptron': tagged}})
def extract(ngrams, dataset, doc_id): # extract keywords print 'Extracting keywords' for i, ngram in enumerate(ngrams): doc = doc_id[i] if field not in dataset[doc]: dataset[doc][field] = set() if doc > 0 and doc % 1000 == 0: print '\t', doc for kw in filter(lambda k: '_' in k, ngram): keyword = kw.replace('_', ' ') kw_tb = TextBlob(keyword) # filter out punctuation, etc (make sure that there are two non-punc words) if len(kw_tb.words) < 2: continue # add keywords which are all proper nouns distinct_tags = set(t[1] for t in kw_tb.tags) if distinct_tags - {'NNP', 'NNPS'} == {}: dataset[doc][field].add(kw_tb.lower()) continue # add noun phrases for np in kw_tb.lower().noun_phrases: dataset[doc][field].add(np) return kw_set_to_list(dataset)
def hi(bot, trigger):
    """Greet *trigger.nick* in a randomly chosen language (network call
    to Google Translate via TextBlob)."""
    lang_codes = ['af', 'ga', 'sq', 'it', 'ar', 'ja', 'az', 'kn', 'eu', 'ko', 'bn', 'la', 'en']
    # BUGFIX: missing space before "on the road of life" glued the
    # greeting onto the nick.
    trans = TextBlob('Greetings dear ' + trigger.nick + ' on the road of life ')
    # Pick any of the available codes (was a hard-coded 0..12).
    trans = trans.translate(to=lang_codes[randint(0, len(lang_codes) - 1)])
    saying = str(trans)
    bot.say(saying)
def on_command(self, msg, stdin, stdout, reply):
    """Translate a message (from the parsed args or from stdin) between
    two languages and print the result to *stdout*; silently does
    nothing when parsing or translation fails."""
    # pre-process args
    # this might mess up if "from" or "to" is left out and
    # the message contains "from" or "to"
    self._push_character(msg["args"], "from", "-", 1)
    self._push_character(msg["args"], "to", "-", 1)
    try:
        args = self.parser.parse_args(msg["args"][1:])
    except (argparse.ArgumentError, SystemExit):
        return
    # get message from the appropriate place
    if args.message:
        message = " ".join(args.message)
    else:
        message = stdin.read().strip()
    # translate
    from_lang = args.from_language
    to_lang = args.to_language
    message = TextBlob(message)
    try:
        translated = message.translate(from_lang=from_lang, to=to_lang)
    except:
        # NOTE(review): bare except swallows everything, including
        # KeyboardInterrupt — consider narrowing to Exception.
        pass
    else:
        print(translated, file=stdout)
def _german(self, text):
    """Translate *text* to English via TextBlob; return the input
    unchanged when translation fails (e.g. network error or the text is
    already English)."""
    blob = TextBlob(text)
    try:
        return str(blob.translate(to="en"))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return text
def process_status(status, lang): text = "" # translate if lang == 'en': text = status['text'] else: blob = TextBlob(status['text']) try: text = str(blob.translate()) except textblob.exceptions.NotTranslated: text = status['text'] # sentiment analysis sentiment = TextBlob(text).sentiment return { "created_at": 1000 * int(time.mktime((status['created_at']).timetuple())) , "id_str": status['id_str'] , "text": text , "sentiment": {"polarity": sentiment[0], "subjectivity": sentiment[1]} , "retweet_count": status['retweet_count'] , "in_reply_to_status_id_str": status['in_reply_to_status_id_str'] , "geo": status['geo'] , "retweeted": status['retweeted'] , "in_reply_to_user_id_str": status['in_reply_to_user_id_str'] }
def matchRhyme(word1, word2):
    """Score how well two Hindi words rhyme after transliteration.

    A sentinel suffix is appended before translation so Google Translate
    preserves the word endings; the 5 transliterated sentinel characters
    are stripped afterwards.  Returns 5 (last char + same vowel before
    it), 4 (last char matches, previous chars differ) or 0 (no match).
    """
    suffix = "टेक्स्ट"  # sentinel; transliterates to 5 chars which are stripped
    blob1 = TextBlob(" " + word1 + suffix)
    blob2 = TextBlob(" " + word2 + suffix)
    # BUGFIX: translate() results expose .string, not .substring.
    t1 = blob1.translate(from_lang="hi", to='en').string[:-5]
    t2 = blob2.translate(from_lang="hi", to='en').string[:-5]
    rhymeMeter = 0  # BUGFIX: was unbound (NameError) on non-matching paths
    vowels = ('a', 'e', 'i', 'o', 'u')
    # Match the last character first.
    if t1 and t2 and t1[-1] == t2[-1]:
        # Same vowel ("matra") in second-to-last position: strongest rhyme.
        if len(t1) > 1 and len(t2) > 1 and t1[-2] == t2[-2] and t1[-2] in vowels:
            rhymeMeter = 5
        # BUGFIX: original compared t1 against itself (always False).
        elif len(t1) > 1 and len(t2) > 1 and t1[-2] != t2[-2]:
            rhymeMeter = 4
    return rhymeMeter
def gen_translate(msg, fromlang=None, outputlang='en'):
    """Translate *msg* from *fromlang* to *outputlang*; when the service
    reports the text as untranslatable, return the input unchanged."""
    try:
        return str(TextBlob(msg).translate(from_lang=fromlang, to=outputlang))
    except NotTranslated:
        return msg
def scrape(self, links=[], ads=True, translator=False):
    """Fetch ad pages (or, when ads=False, crawl category pages first to
    discover ad links) and extract structured posting data.

    Returns a list of dicts (title/link/images/text/sentiment/...).

    NOTE(review): mutable default `links=[]` is shared across calls.
    NOTE(review): `values` is a single dict reused for every response, so
    every element appended to `data` is the SAME object — confirm whether
    a fresh dict per ad was intended.
    """
    responses = []
    values = {}
    data = []
    if ads:
        for link in links:
            r = requests.get(link)
            responses.append(r)
    else:
        # Crawl each category page for individual ad links first.
        for link in links:
            r = requests.get(link)
            text = unidecode(r.text)
            html = lxml.html.fromstring(text)
            links = html.xpath("//div[@class='cat']/a/@href")
            for link in links:
                if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3:
                    # polite random crawl delay
                    time.sleep(random.randint(5,27))
                try:
                    responses.append(requests.get(link))
                    print link
                except requests.exceptions.ConnectionError:
                    print "hitting connection error"
                    continue
    for r in responses:
        text = r.text
        html = lxml.html.fromstring(text)
        values["title"] = html.xpath("//div[@id='postingTitle']/a/h1")[0].text_content()
        values["link"] = unidecode(r.url)
        values["new_keywords"] = []
        try:
            values["images"] = html.xpath("//img/@src")
        except IndexError:
            values["images"] = "weird index error"
        pre_decode_text = html.xpath("//div[@class='postingBody']")[0].text_content().replace("\n","").replace("\r","")
        values["text_body"] = pre_decode_text
        try:
            values["posted_at"] = html.xpath("//div[class='adInfo']")[0].text_content().replace("\n"," ").replace("\r","")
        except IndexError:
            values["posted_at"] = "not given"
        values["scraped_at"] = str(datetime.datetime.now())
        body_blob = TextBlob(values["text_body"])
        title_blob = TextBlob(values["title"])
        values["language"] = body_blob.detect_language() #requires the internet - makes use of google translate api
        values["polarity"] = body_blob.polarity
        values["subjectivity"] = body_blob.sentiment[1]
        if values["language"] != "en" and not translator:
            values["translated_body"] = body_blob.translate(from_lang="es")
            values["translated_title"] = title_blob.translate(from_lang="es")
        else:
            values["translated_body"] = "none"
            values["translated_title"] = "none"
        text_body = values["text_body"]
        title = values["title"]
        values["phone_numbers"] = self.phone_number_parse(values)
        data.append(values)
    return data
def getKeywords(text, useless):
    """Return a TextBlob containing the words of *text* with every word
    in *useless* removed.

    BUGFIX: the original compared words with `is` (identity, not
    equality — unreliable for strings) and called TextBlob.remove(),
    which does not exist; TextBlob objects are immutable, so a filtered
    blob is built instead.
    """
    stop = set(useless)
    kept = [w for w in TextBlob(text).words if w not in stop]
    return TextBlob(" ".join(kept))
def answer(question):
    """Detect the question's language; for English, tokenize it, strip
    bad words, roll the persisted vocabulary file, and free-associate a
    reply; for any other language just reply.

    Mutates globals IsAnswer/detected/u and the module-level `words`
    list; rewrites newwords.txt on every call.
    """
    global IsAnswer, detected, u
    IsAnswer = True
    DetectLang = TextBlob(question)
    detected = DetectLang.detect_language()  # network call (Google Translate)
    if detected == 'en':
        print("language detected: en")
        u = 'en'
        print(len(words), "len(words)")
        low = question.lower()
        questions = re.sub('[^\w]', ' ', low).split()  #list
        BadWords(questions)
        print(questions)
        def writeout(words, question, IsAnswer):
            # Keep the vocabulary bounded (~3000 words) by evicting random
            # entries, then persist it plus the new tokens to newwords.txt.
            r = []
            if len(words) > 3000:
                a1 = len(questions)
                for x in range(0, a1):
                    words.remove(random.choice(words))
                print(len(words), "len(words)")
            else:
                pass
            os.remove('newwords.txt')
            file = open('newwords.txt', 'w')
            words.extend(questions)
            r.extend(words)
            s = ' '.join(r)
            file.write(s)
        writeout(words, question, IsAnswer)
        randomthought()
    else:
        u = detected
        print("language detected:", u)
        randomthought()
def check_speech_patterns(text):
    """Return True as soon as any 2- to 5-gram of *text* POS-tags to one
    of the known speech patterns; False when none match."""
    PATTERNS={
        ("PRP","DT"),
        ("CC","VBD"),
        ("VB","RB"),
        ("VB","PRP$"),
        ("NN","POS"),
        ("NN","MD","VB"),
        ("VB","PRP$","NN"),
        ("MD","VB","VBN"),
        ("NN","IN","PRP$"),
        ("IN","PRP$","JJ"),
        ("VB","PRP","DT","NN"),
        ("VBD","RB","JJ","NNS"),
        ("NNP","NNP","NNP","NNP"),
        ("PRP$","NN","CC","PRP"),
        ("NNP", "NNP", "NNP", "NNP", "NNP"),
        ("NN", "IN", "DT", "NNS", "IN"),
        ("PRP$", "NN", "IN", "DT", "NN"),
        ("IN", "DT", "NN", "WDT", "VBZ"),
        ("NN", "IN", "PRP$", "JJ", "NN"),
        ("DT", "NN", "IN", "NN", "NN")
    }
    blob = TextBlob(text)
    for size in range(2, 6):
        for gram in blob.ngrams(n=size):
            tagged = TextBlob(" ".join(gram)).tags
            # split (word, tag) pairs into parallel tuples
            gram_words, gram_tags = zip(*tagged)
            if gram_tags in PATTERNS:
                return True
    return False
def sentiment_pattern(text, gram_n=6):
    """Split *text* into n-grams and classify each with a pickled
    predictor, mapping its 0-4 class labels onto -1/0/+1 sentiment.

    Returns (and prints; Python 2) the per-gram sentiment list.
    NOTE(review): loads predictor.pickle from the working directory on
    every call — unpickling untrusted files is unsafe; confirm source.
    """
    blob= TextBlob(text)
    ngrams=blob.ngrams(n=gram_n)
    sentiment_list=[]
    datalist = []
    for gram in ngrams:
        str_gram=" ".join(gram)
        print str_gram
        # Datapoint(label, ?, text, ?) — NOTE(review): confirm field order
        data = (0, 0, str_gram, None)
        datalist.append(Datapoint(*data))
    # (earlier TextBlob-polarity scoring removed; predictor is used instead)
    predictor = pickle.load(open("predictor.pickle", "rb" ) )
    prediction = predictor.predict(datalist)
    for sentiment in prediction:
        sentiment = int(sentiment)
        # classes <2 negative, ==2 neutral, >2 positive
        if sentiment < 2:
            sentiment_list.append(-1)
        if sentiment == 2:
            sentiment_list.append(0)
        if sentiment > 2:
            sentiment_list.append(1)
    print sentiment_list
    return sentiment_list
def findLanguage(reducedList3):
    """Detect the language of up to 5000 random samples (first fields
    longer than 5 chars) and print the five most common languages.

    Python 2; each detection is a Google Translate network call.
    """
    languageMap = {}
    currentNumber = 0
    shuffle(reducedList3)
    for i in reducedList3:
        if currentNumber < 5000:
            if len(i[0]) > 5:
                try:
                    b = TextBlob(unicode(i[0]))
                    currentLanguage = b.detect_language()
                    if currentLanguage in languageMap:
                        languageMap[currentLanguage] += 1
                    else:
                        languageMap[currentLanguage] = 1
                except:
                    # NOTE(review): bare except hides real errors (network,
                    # encoding) — consider narrowing.
                    pass
                currentNumber += 1
                print currentNumber
    # Expand counts back into a flat list just to feed Counter.
    listOfWords = []
    for i in languageMap:
        for x in range(0, languageMap[i]):
            listOfWords.append(i)
    listOfWordsCounter = collections.Counter(listOfWords)
    print 'Best Languages:', listOfWordsCounter.most_common(5)
    print languageMap
def getEntities(parser, tweet, xEntities):
    """Collect NNP/NN/PRP-tagged tokens from *tweet* into *xEntities*
    (word -> tag) using three taggers: pattern's tag(), TextBlob's
    parse(), and a spaCy parser.

    Returns the updated dict — or the exception object on failure;
    NOTE(review): callers likely expect the dict; consider re-raising.
    """
    try:
        spacyParsedObject = parser(tweet)
        sentence = TextBlob(tweet)
        textblobTaggedObject = sentence.parse().split()
        patterntaggedObject = tag(tweet, tokenize=True)
        # 1) pattern tagger: (word, tag) pairs
        for word in patterntaggedObject:
            word, wordtag=word
            if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
                v = str(word)
                v = v.strip()
                if(v not in xEntities):
                    xEntities[v]=str(wordtag)
        # 2) TextBlob parse output: nested token lists
        for taggedObject in textblobTaggedObject:
            for word in taggedObject:
                word, wordtag=word[0], word[1]
                if wordtag == "NNP" or wordtag == "NN" or wordtag == "PRP":
                    v = str(word)
                    v = v.strip()
                    if(v not in xEntities):
                        xEntities[v]=str(wordtag)
        # 3) spaCy tokens expose .tag_
        for word in spacyParsedObject:
            if word.tag_ == "NNP" or word.tag_ == "NN" or word.tag_ == "PRP":
                v = str(word)
                v = v.strip()
                if(v not in xEntities):
                    xEntities[v]=str(word.tag_)
        return xEntities
    except Exception as e:
        return e
def nounize(aline):
    """Replace each noun (NN) in *aline* with a random word from the
    module-level `postnouns` list and return the space-joined result.

    NOTE(review): the collapsed source makes the loop indentation
    ambiguous — the append may belong inside the `if`, which would drop
    all non-noun words from the output; confirm against the original.
    """
    words = ''
    # Python 2: aline is a byte string
    aline = TextBlob(aline.decode('ascii', errors='replace'))
    for word, tag in aline.tags:
        if tag == 'NN':
            word = random.choice(postnouns).strip()
        words = words + ' ' + word
    return words
def translate_this(jenni, msg):
    """Detect the language of the captured text and, when it is not
    already English, reply with an English translation."""
    blob = TextBlob(msg.groups()[0])
    source = blob.detect_language()
    if source == 'en':
        return
    translated = blob.translate(from_lang=source, to='en')
    jenni.reply("{}".format(translated))
def sentiment(): doob = "Great Movie!" blob = TextBlob(doob) for sentence in blob.sentences: print(sentence.sentiment) print blob.translate(to="cn")
def to_english(message, original_language=None):
    """Translate *message* to English, using *original_language* as the
    source when given, otherwise letting TextBlob auto-detect.

    BUGFIX: the body referenced an undefined name `text`, raising
    NameError on every call; it now uses the `message` parameter.
    """
    blob = TextBlob(message)
    if original_language is not None:
        return blob.translate(from_lang=original_language, to="en")
    else:
        return blob.translate(to="en")
def translate(phrase, from_lang, to_lang='en'):
    """Translate *phrase* from *from_lang* to *to_lang* and return the
    plain string; return an apology message on any failure."""
    blob = TextBlob(phrase)
    try:
        translation = blob.translate(from_lang=from_lang, to=to_lang)
        return translation.string
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return "Sorry, no translation!"
def gen_translate(msg, fromlang, outputlang):
    """Translate *msg* between the given languages; return the input
    untouched when the API reports it as not translated.

    FIXME: language detection is broken.
    """
    try:
        return str(TextBlob(msg).translate(from_lang=fromlang, to=outputlang))
    except NotTranslated:
        return msg
def get_tupels(text):
    """Return the bigrams of lowercased *text* as a list of 2-tuples.

    # assumption: don't is two words (do n't), as in "do not"
    # this can be easily changed by modifying the tokenizer
    # http://stackoverflow.com/questions/30550411
    """
    ngrams = TextBlob(text.lower()).ngrams(n=2)
    # One tuple() per n-gram suffices (the original applied it twice);
    # returning a concrete list instead of a lazy map is re-iterable.
    return [tuple(gram) for gram in ngrams]
def get_text(self):
    """ NOTE: THIS SHOULD NOT REBUILD DICT EVERY TIME -- REFACTOR """
    # Build an n-gram model from this object's raw content and store a
    # generated sentence on self.text.
    blob = TextBlob(self.content.decode('utf-8'))
    words_ = blob.split()
    d = parser.build_ngram_dict(words_)
    s = parser.build_sentence(d)
    # TODO: add check for max text length
    self.text = s
def translate_pt(bot, update):
    """Reply to a Telegram message with its Brazilian-Portuguese
    translation (Python 2; network call via TextBlob)."""
    chat_id = update.message.chat_id
    source_text = text_replace(update.message.text)
    translated = TextBlob(source_text).translate(to='pt-BR')
    return bot.sendMessage(chat_id, text=u'Tradução: %s' % unicode(translated))
def translate(string, lang):
    # NOTE(review): this early return disables the function — every call
    # yields "" and all code below is unreachable. Presumably a temporary
    # stub; confirm intent before removing it.
    return ""
    tb = TextBlob(string)
    if lang != "en":
        try:
            tb = tb.translate(to="en")
        except:
            # bare except: best-effort translation, keep original on failure
            pass
    return str(tb)
def GetBigrams(text):
    """Return the bigrams of *text* as underscore-joined strings.

    Each entry keeps the trailing underscore the original format used,
    e.g. "hello_world_".
    """
    pairs = TextBlob(text).ngrams(n=2)
    return ["".join(token + "_" for token in pair) for pair in pairs]
def correctSpelling(text):
    '''
    Correcting the spelling of the words
    :param text: the input text
    :return: a TextBlob with best-guess spelling corrections applied
    '''
    return TextBlob(text).correct()
fil = 'data/tweethack1.json' sleep_time = 60*5 i = 0 while i < 100000: hack_dict = {} breach_list = [] ddos_list = [] hijack_list = [] tstmp = str(datetime.datetime.now()).replace('-','').replace(' ','').split(':')[0] + str(datetime.datetime.now()).split(':')[1] try: # Twitter sentiment anlysis for word in breach_words: breach_tweets = api.search(word) for tweet in breach_tweets: analysis = TextBlob(tweet.text) sentiment = analysis.sentiment.polarity if sentiment < 0: breach_list.append(str(analysis)) for word in ddos_words: ddos_tweets = api.search(word) for tweet in ddos_tweets: analysis = TextBlob(tweet.text) sentiment = analysis.sentiment.polarity if sentiment < 0: ddos_list.append(str(analysis)) for word in hijack_words: hijack_tweets = api.search(word) for tweet in hijack_tweets:
import tweepy
from textblob import TextBlob

# Quick TextBlob demo on a fixed sentence.
wiki = TextBlob("Vivek is always angry beacuse he can't manage his time")
# print(wiki.tags)   # Parts of speech
# print(wiki.words)  # Tokenize
print(wiki.sentiment)

# SECURITY NOTE(review): hard-coded Twitter API credentials committed to
# source — these should be revoked and loaded from the environment or a
# secrets store instead.
consumer_key = 'o5CbrDAJkpCLBhHTsu3YkSsvN'
consumer_secret = '2irncRv189vQTBMF3qAO5vwO4LpEHT29rH8r3nagzzvNt9IEEQ'
access_token = '2996486912-b7NCHNfnISl5fsXVO0OLH4Dl7NyfnXCtxwTgsUh'
access_token_secret = ' 9KJksG6vLknQs80MimZvHVoiAuYkeGaXrtUxL8Sulxkeg'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Print each 'Trump' search result with its TextBlob sentiment.
public_tweets = api.search('Trump')
for tweet in public_tweets:
    print(tweet.text)
    analysis = TextBlob(tweet.text)
    print(analysis.sentiment)
    print("")
def _spell_check(question):
    """Return *question* with TextBlob's best-guess spelling corrections."""
    blob = TextBlob(question)
    return blob.correct()
def main():
    """Score SegPhrase candidate phrases and write a TSV report.

    combined = sqrt(quality * (wiki_score + 1) * (noun_phrase_count + 1))
    Input lines: phrase \t wiki_score \t ... \t quality_score.
    """
    input_filepath = "linked_results.wiki.txt"
    output_filepath = "linked_results.wiki.pos.tsv"
    start = time.time()
    np_phrase_cnt = 0
    phrase_only = True  # NOTE(review): unused in the active code path
    with open(input_filepath, "r") as fin, open(output_filepath, "w") as fout:
        cnt = 0
        fout.write("\t".join([
            "Phrase", "Combined Score", "Phrase Quality Score",
            "Wiki Linking Score", "NP Count Score", "\n"
        ]))
        for line in fin:
            cnt += 1
            if cnt % 1000 == 0:
                # progress indicator
                print(cnt)
            line = line.strip()
            segs = line.split("\t")
            phrase = segs[0]
            phrase_quality_score = float(segs[-1])
            try:
                wiki_score = int(segs[1])
                np_cnt_score = len(TextBlob(phrase).noun_phrases)
            except (ValueError, UnicodeDecodeError) as e:
                # skip malformed or non-decodable lines
                continue
            combined_score = math.sqrt(phrase_quality_score * (wiki_score + 1) * (np_cnt_score + 1))
            fout.write("\t".join([
                "_".join(phrase.split()), str(combined_score),
                str(phrase_quality_score), str(wiki_score),
                str(np_cnt_score), "\n"
            ]))
    # (earlier output formats removed; see version history)
    end = time.time()
    print("Number of additional noun phrases: %s" % np_phrase_cnt)
    print("Finish using POS Tagger for NP extraction using %s seconds" % (end - start))
# Per-tweet pipeline: load the tweet JSON into a DataFrame, trim/clean
# and stem each text, translate it (tr is a pre-configured translator
# object from elsewhere in the file), then wrap the result in a TextBlob
# for sentiment scoring (the classification itself is commented out).
print(tweet)
df = pd.read_json(tweet)
for index, row in df.iterrows():
    test = row['text']
    n = len(test)
    # NOTE(review): slicing [2:n-1] looks like it strips a b'...' bytes
    # repr wrapper — confirm against the input format.
    ges = test[2:n - 1]
    print(ges)
    gas = ges.strip()
    blob = clean_tweet(gas)
    hasil = stemmer.stem(blob)
    print(hasil)
    blob1 = str(hasil)
    tr.set_text(blob1)
    bersih = tr.translate()
    kedas = TextBlob(bersih)
    print(bersih)
    # (positive/negative/neutral classification and mongo insert are
    # commented out in the original)
def __call__(self, text):
    """Return the lemmatized, lowercased tokens of *text* that also
    appear in self.words."""
    tokens = TextBlob(text).words.lemmatize().lower()
    return set(tokens).intersection(self.words)
# Parse the Status objects dates = [] polarities = [] for s in statuses: # Uncomment below to print the contents of the tweets status_text = s.text status_time = s.created_at # print '\n' + status_time # print s.text fav_count = s.favorite_count retweet_count = s.retweet_count # print "Favorite Count: " + str(fav_count) # print "Retweet Count" + retweet_count # Run sentiment analysis using TextBlob tb = TextBlob(status_text) status_polarity = tb.sentiment.polarity polarities.append(status_polarity) # Parse and format the date/time of the tweet split_time = status_time.split(" ") dt = datetime.datetime(int(split_time[5]), monthmap[split_time[1]], int(split_time[2]), 0, 0) dates.append(dt) # Create numpy arrays for dates and polarities of the tweets date_array = np.array([dt for dt in dates]) polarities_array = np.array(polarities) # Aggregate tweets that are on the same date and take average polarity
welcome = sys.argv[1] else: welcome = "How are you, Coco?" runTime = 60 ;# seconds startTime = time.time() while 1: if initialGreetings == 0: cocoBot(welcome) else: cocoBot(response) response = raw_input() chechLanguage(response) responseMsg = TextBlob(response) if initialGreetings != 1: cocoAssignsAvatar() initialGreetings = 1 print("After cocoAssignsAvatar()") itsTimeForBye = 0 for word in responseMsg.words: if word.lower() in USER_INIT_BYE: itsTimeForBye = 1 elapsed = time.time() - startTime if elapsed >= runTime : cocoWantsABreak("cocoInitBye") response = raw_input(str(user_avatar) + " >> ")
# Build a bilingual dictionary: translate the n most common words of the
# source corpus and write "source target" pairs split into a train file
# (first 5000) and a test file (the rest).
#
# argv: [1]=corpus file, [2]=train out, [3]=test out, [4]=source lang,
#       [5]=target lang, [6]=tab-separated POS tag-mapping file
n = 6000
train_n = 5000
test_n = 1000
allwords = re.findall('\w+', open(sys.argv[1]).read())
word_list = Counter(allwords).most_common(n)
# Load the POS tag mapping (currently only used by the commented line).
m = open(sys.argv[6], "r")
tags = {}
for line in m:
    pair = line.split('\t')
    tags[pair[0]] = pair[1].rstrip()
m.close()
f1 = open(sys.argv[2], "w")
f2 = open(sys.argv[3], "w")
source = sys.argv[4]
target = sys.argv[5]
count = 0
for word in word_list:
    # word is a (token, frequency) pair from Counter.most_common
    word_map = TextBlob(word[0]).translate(from_lang=source, to=target)
    #tag = tags[TextBlob(word[0]).tags[0][1]]
    word_pair = (word[0].rstrip() + " " + word_map.string + "\n")
    count = count + 1
    if count <= train_n:
        f1.write(word_pair.encode('utf8'))
    else:
        f2.write(word_pair.encode('utf8'))
f1.close()
f2.close()
num = 11 elif (wordFinder("@oursoutheastern", line)): num = 12 elif (wordFinder("@Grambling1901", line)): num = 13 elif (wordFinder("@SouthernU_BR", line)): num = 14 elif (wordFinder("@nsula", line)): num = 15 elif (wordFinder("@LA_College", line)): num = 16 elif (wordFinder("@NichollsState", line)): num = 17 tweets_per_college[num] += 1 college_sentiment_sum[num] += TextBlob(line).sentiment.polarity update(num, line, countsCollege, sentimentsums) for t in range(0, 8): sentimentsums[t] = sentimentsums[t] / countsCollege[t] for t in range(0, 18): college_sentiment_sum[t] = college_sentiment_sum[t] / tweets_per_college[t] #prints sentiment averages for different factors: popn, rank, region, followers on twitter '''for s,c,l,h in zip (sentimentsums, countsCollege, mini, maxi): print ('%.3f %d' + str(l).rjust(5) + str(h).rjust(5)) % (s, c)''' print 'College'.rjust(25) + 'No. of tweets'.rjust( 15) + 'Average sentiment score'.rjust(30) for n, t, s in zip(college, tweets_per_college, college_sentiment_sum):
def getsent(st):
    """Sentiment polarity of *st* via TextBlob; 0 for non-string input."""
    if not isinstance(st, str):
        return 0
    return TextBlob(st).sentiment.polarity
for row in reader: review= dict() review['id'] = int(row[0]) review['patient'] = row[1] review['review'] = row[2] review['clean'] = review['review'] # Remove all non-ascii characters review['clean'] = strip_non_ascii(review['clean']) # Create textblob object review['TextBlob'] = TextBlob(review['clean']) reviews.append(review) # DEVELOP MODELS for review in reviews: review['polarity'] = float(review['TextBlob'].sentiment.polarity) review['subjectivity'] = float(review['TextBlob'].sentiment.subjectivity) if review['polarity'] >= 0.1: review['sentiment'] = 'positive' elif review['polarity'] <= -0.1:
# Plot setup: matplotlib style + tick locator for the sentiment charts.
print(plt.style.available)
plt.style.use("seaborn-talk" )  # _classic_test, fivethirtyeight, classic, bmh, seaborn-talk
loc = plticker.MultipleLocator(base=.3)

polarity = []
subjectivity = []
lines = []
polarityEqualsZero = 0  # count of perfectly neutral lines

# Score each distinct, non-empty lyric line with TextBlob.
with open("./GambinoSong.txt") as f:
    for line in f.read().split("\n"):
        if line != "" and line not in lines:
            sentiment = TextBlob(line)
            if sentiment.sentiment.polarity != 0:
                polarity.append(sentiment.sentiment.polarity)
            else:
                # neutral lines are counted separately but still plotted
                polarityEqualsZero += 1
                polarity.append(sentiment.sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
            lines.append(line)

def plot(p, data, label, fontsize=12):
    """Draw *data* on axes *p* with an x label of LINES and *label* on y."""
    p.plot(data)
    p.locator_params(nbins=3)
    p.set_xlabel("LINES", fontsize=fontsize)
    p.set_ylabel(label, fontsize=fontsize)
# Load the first 1000 rows of the Twitter training data.
twt = pd.read_csv('twitter training data.csv', encoding = 'latin-1')
twt.head()
twt = twt.iloc[:1000]
#nltk.download()

# Sentiment analysis using Text Blob
# Creating empty dataframe to store results
FinalResults = pd.DataFrame()

# Run Engine — column index 5 holds the tweet text; score each tweet's
# polarity with TextBlob and accumulate one-row frames.
for i in range(0, twt.shape[0]):
    blob = TextBlob(twt.iloc[i,5])
    temp = pd.DataFrame({'Tweets': twt.iloc[i,5], 'Polarity': blob.sentiment.polarity}, index = [0])
    FinalResults = FinalResults.append(temp)

# Bucket polarity into Positive/Negative/Neutral labels.
FinalResults['Sentiment'] = FinalResults['Polarity'].apply(lambda x: 'Positive' if x>0 else 'Negative' if x<0 else 'Neutral')
FinalResults['Sentiment'].describe()
#Results: Most of the tweets are Neutral

# Sentiment Analysis using Vader
FinalResults_Vader = pd.DataFrame()
# (continues a counter from earlier context)
j += 1
k = 0
# Write per-word sentiment statistics for the top SIZE words to output.csv:
# one row per occurrence plus a summary row of averages per word.
with open('output.csv', 'wb') as c:
    writer = csv.writer(c)
    writer.writerow(['Word', 'Count', 'Sentence', 'Splice', 'Polarity', 'Sentence Pol', 'Subjectivity', 'Avg Polarity', 'Avg Whole Pol', 'Avg Subjectivity','Location'])
    while k < SIZE:
        polarSum = 0
        subjectSum = 0
        wholeSum = 0
        # spot[0] indexes the sentence this word occurred in
        for spot in topWords[k].getSentenceArray():
            splice = getSplice(spot)
            whole = sentences[spot[0]]
            wholePol = TextBlob(whole.decode('utf-8')).polarity
            polarity = TextBlob(splice.decode('utf-8')).polarity
            subjectivity = TextBlob(splice.decode('utf-8')).subjectivity
            polarSum += polarity
            subjectSum += subjectivity
            wholeSum += wholePol
            writer.writerow([str(topWords[k].getWord()), str(topWords[k].getCount()), sentences[spot[0]], str(splice), str(polarity), str(wholePol), str(subjectivity)])
        topWords[k].setAvgPol(polarSum/topWords[k].getCount())
        topWords[k].setAvgSub(subjectSum/topWords[k].getCount())
        # summary row: averages for this word
        writer.writerow([" ", " ", " ", " ", " ", " ", " ", str(polarSum/topWords[k].getCount()), str(wholeSum/topWords[k].getCount()), str(subjectSum/topWords[k].getCount()), str(topWords[k].getSentenceArray())])
        k += 1
c.close()  # NOTE(review): redundant — the with-block already closed the file
# Parse the fully-scrolled page source with BeautifulSoup and pull each
# Reddit post's title, TextBlob sentiment scores, URL and timestamp.
print("Scrolling complete. Data collected.\nAnalysing data...")
source_data = browser.page_source
# Throw your source into BeautifulSoup and start parsing!
soup = bs(source_data, "html5lib")
posts = soup.find_all('div', class_="scrollerItem")
now = datetime.datetime.now()
for div in posts:
    print("----------------------")
    div_descendants = div.descendants
    post = Post()
    for descElement in div_descendants:
        if descElement.name == 'h2':
            # h2 holds the post title; score it with TextBlob
            print(descElement.text)
            print(TextBlob(descElement.text).sentiment)
            post.title = descElement.text
            post.polarity = round(
                TextBlob(descElement.text).sentiment.polarity, 2)
            post.subjectivity = round(
                TextBlob(descElement.text).sentiment.subjectivity, 2)
        if descElement.name == 'a' and descElement.get(
                'data-click-id') == 'body':
            print(descElement.get("href"))
            post.url = descElement.get("href")
        if descElement.name == 'a' and descElement.get(
                'data-click-id') == 'timestamp':
            # relative timestamp text, e.g. "5 hours ago"; processing
            # continues beyond this chunk
            wordList = descElement.text.split(" ")
# Inspect the fetched tweets, then build polarity/subjectivity lists and
# start a word-frequency dictionary over the combined tweet text.
print("Tweet text:", tweet_data[0]['text'])
# iterate every index of tweet_data and print each tweet's text
for t in range(len(tweet_data)):
    print("Tweet text: ", tweet_data[t]['text'])

# Textblob sample:
#tb = TextBlob("You are a brilliant computer scientist.")
#print(tb.polarity)

polarity = []
subjectivity = []
tweets = ""  # accumulate all tweet text into one string for the word cloud
for e in range(len(tweet_data)):
    tweet_blob = TextBlob(tweet_data[e]['text'])
    polarity.append(tweet_blob.polarity)
    subjectivity.append(tweet_blob.subjectivity)
    tweets = tweets + tweet_data[e]['text']

textbird_tb = TextBlob(tweets)
undesired_words = ["hi", "bye", "interesting", "goodnight", "spider", "fear"]
filtered_dictionary = {}
# NOTE(review): `filtered_words`, `words` and `count` are undefined at
# this point — this line raises NameError; it probably belongs inside
# the loop below once those names exist. Confirm and fix.
filtered_words[words] = count
for word in textbird_tb.words:
    if(len(word) < 2):
        continue
    elif( not word.isalpha()):
        continue
for line in con: if (len(line) <= 5): continue cnt += 1 obj = json.loads(line.replace('\n', '')) text = obj.get('text') text = text.replace('\t', '').replace('\n', '') if text in tweet_list: continue out = analyser.polarity_scores(text) compound = out['compound'] text_blob = TextBlob(text) polarity = text_blob.sentiment.polarity if not out['pos'] > 0.1: if out['pos'] - out['neg'] < 0: neg_count += 1 outfile.write( obj.get('id_str') + '\t' + obj.get('created_at') + '\t' + text + '\t' + str(compound) + '\t' + str(polarity) + '\t' + str(0) + '\t' + str(1) + '\t' + str(profanity.contains_profanity(text)) + '\n') tweet_list.append(text) if not out['neg'] > 0.1: if out['pos'] - out['neg'] > 0: pos_count += 1 outfile.write(
def getPolarity(text): return TextBlob(text).sentiment.polarity
def getSubjectivity(text): return TextBlob(text).sentiment.subjectivity
def n_containing(word, text_list):
    """Number of documents in *text_list* that contain *word*."""
    return sum(1 for text in text_list if word in text)

def idf(word, text_list):
    """Inverse document frequency with +1 smoothing in the denominator."""
    return math.log(len(text_list) / (1 + n_containing(word, text_list)))

def tf_idf(word, text, text_list):
    """TF-IDF score; relies on a module-level term_frequency()."""
    return term_frequency(word, text) * idf(word, text_list)

# Load ./doc-res/doc_1.txt, doc_2.txt, ... until a file is missing.
file_num = 1
text_list = []
while True:
    try:
        with open('./doc-res/doc_' + str(file_num) + ".txt", 'r') as doc:
            file_num += 1
            text_list.append(TextBlob(doc.read()))
    except FileNotFoundError:
        break

# Print the four highest-scoring words of each document.
for i, text in enumerate(text_list):
    print("Top words in document {}".format(i + 1))
    ratings = {word: tf_idf(word, text, text_list) for word in text.words}
    sorted_words = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
    for word, rating in sorted_words[:4]:
        print(f"Word: {word}, TF-IDF: {round(rating, 5)}")
def update(num, line, countsCollege, sentimentsums):
    """Accumulate per-category sentiment statistics for one tweet.

    For each of four binary groupings of college index *num*, bump the
    bucket's tweet count, add the line's polarity to its running sum, and
    track the min/max polarity in the module-level `mini`/`maxi` lists
    (defined elsewhere in the file).

    PERF/BUGFIX: the original recomputed TextBlob(line).sentiment.polarity
    up to six times per branch; it is a deterministic local computation,
    so it is now done once with identical results.
    """
    polarity = TextBlob(line).sentiment.polarity

    def _accumulate(idx):
        # one bucket update: count, sum, min, max
        countsCollege[idx] += 1
        sentimentsums[idx] += polarity
        if polarity < mini[idx]:
            mini[idx] = polarity
        if polarity > maxi[idx]:
            maxi[idx] = polarity

    _accumulate(0 if num in (0, 1, 2, 4, 6, 7, 12, 15) else 1)
    _accumulate(2 if num in (0, 1, 2, 3, 4, 5, 6, 12, 14, 15) else 3)
    _accumulate(4 if num in (2, 5, 6, 13, 15) else 5)
    _accumulate(6 if num in (1, 2, 4, 6, 14) else 7)
def on_status(self, status):
    """Handle one incoming tweet: extract fields, score sentiment, persist.

    Skips retweets.  Returns True to keep the stream alive while within
    the configured time window, False to stop listening.
    """
    if status.retweeted:
        # Avoid retweeted info: only original tweets are processed.
        return True

    # Extract attributes from each tweet.
    id_str = status.id_str
    created_at = status.created_at
    text = self.deEmojify(status.text)  # pre-processing the text
    # Pre-trained TextBlob model providing sentiment info for the tweet.
    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity
    user_created_at = status.user.created_at
    user_location = self.deEmojify(status.user.location)
    user_description = self.deEmojify(status.user.description)
    user_followers_count = status.user.followers_count
    longitude = None  # initialize
    latitude = None  # initialize
    if status.coordinates:  # only present when the tweet is geotagged
        longitude = status.coordinates['coordinates'][0]
        latitude = status.coordinates['coordinates'][1]
    retweet_count = status.retweet_count
    favorite_count = status.favorite_count
    print("status.text: ", status.text)
    print("Long: {}, Lati: {}".format(longitude, latitude))

    # Store all data in PostgreSQL.
    try:
        self.engine.connect()
        self.mydb = self.engine.raw_connection()
        self.mycursor = self.mydb.cursor()
        sql = "INSERT INTO {} (id_str, created_at, text, polarity, subjectivity, user_created_at, user_location, user_description, user_followers_count, longitude, latitude, retweet_count, favorite_count) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)".format(settings.TABLE_NAME)
        val = (id_str, created_at, text, polarity, subjectivity, user_created_at, user_location,
               user_description, user_followers_count, longitude, latitude, retweet_count, favorite_count)
        self.mycursor.execute(sql, val)
        self.mydb.commit()
        # Trim oldest rows so the table only tracks recent daily info.
        # BUG FIX: the COUNT(*) subquery previously hard-coded table
        # "twitter2" while every other reference used settings.TABLE_NAME,
        # so the size guard silently broke for any other table name.
        delete_query = '''
        DELETE FROM {0}
        WHERE id_str IN (
            SELECT id_str
            FROM {0}
            ORDER BY created_at asc
            LIMIT 200) AND (SELECT COUNT(*) FROM {0}) > 9600;
        '''.format(settings.TABLE_NAME)
        self.mycursor.execute(delete_query)
        self.mydb.commit()
    except Exception as error:
        print("Error inserting/deleting info into/from the twitter table: ", error)
    finally:
        # BUG FIX: close the cursor even when the insert/delete raised;
        # the original only closed it on the success path.
        try:
            self.mycursor.close()
        except Exception:
            pass

    # Time-window check: keep streaming until the configured limit elapses.
    if (time.time() - self.start_time) < self.limit_time:
        print("Working")
        return True  # continue listening to the Twitter stream
    else:
        print("Time Complete")
        return False  # stop listening to the Twitter stream
# -- Sentiment Analysis -- # sub_df = pd.read_csv( "/Users/FCRA/Desktop/ALL/BSI/bsi-reddit-gme/pyfiles/sub_df.csv") sent_df = sub_df[["created", "author", "title"]] # already preproc titles sub_df2 = pd.read_csv( "/Users/FCRA/Desktop/ALL/BSI/bsi-reddit-gme/sentiment_files/preproc_titles.csv" ).reset_index(drop=True) sent_df["ptitle"] = sub_df2["title"] # --- General Sentiment of Titles with TextBlob sent_df["polarity_textBlob"] = sent_df["ptitle"].apply( lambda x: TextBlob(x).sentiment.polarity) sent_df["created"] = pd.to_datetime(sent_df["created"]).dt.floor('d') daily_sent_df_textBlob = sent_df[["created", "polarity_textBlob" ]].groupby(["created"], as_index=False).mean() daily_sent_df_textBlob["z_polarity_textBlob"] = daily_sent_df_textBlob[ "polarity_textBlob"] / daily_sent_df_textBlob["polarity_textBlob"].std( axis=0) #sent_df[["ptitle", "polarity_textBlob"]].to_csv("titles_textblob.csv") # --- Sentiment using Vader and styled lexicon vader = SentimentIntensityAnalyzer() vader.lexicon.update(new_words)
def NaiveBaiyes_Sentimental(sentence): blob = TextBlob(sentence, analyzer=NaiveBayesAnalyzer()) NaiveBayes_SentimentScore=blob.sentiment.classification return NaiveBayes_SentimentScore
from textblob import TextBlob d = TextBlob('welcome to world of book') print(d.sentences) print(d.words) print(d.noun_phrases)
def score(self, text: str) -> float: # pip install textblob from textblob import TextBlob return TextBlob(text).sentiment.polarity
<th width="230px">Username</th> <th>Product Id </th> <th>Review</th> </tr> """ fo = open(b) reader = csv.reader(fo) fi = open("pro.csv") read = csv.reader(fi) cs = csv.writer(open("WEIGHT.csv", "wb")) from textblob import TextBlob for r in reader: review = TextBlob(r[7]) w = 0 if review.sentiment.subjectivity < 0.5: w = w + 0.1 else: w = w + 0 helpful = float(r[3]) outof = int(r[4]) if outof == 0: w = w + 0.1 else: if outof < 9: value = helpful * outof ratio = value / outof if ratio < 0.5:
text = BeautifulSoup(text, features="html.parser") # removes html tags text = text.get_text() # removes target=blank Markdown tags text = text.replace("{:target=\"_blank\"}", '') # removes Markdown links text = regex.sub(linkremover, '', text) # removes anything that isn't an alphabetical character and casts the remaining string to lowercase text = regex.sub(nonalphabeticalremover, ' ', text).lower() blob = TextBlob(text) sentiments.update({filename: blob.sentiment.polarity}) wordsperpost.update({filename: len(text.split())}) wordcount += len(text.split()) # nltk stemming/token magic from http://ryancompton.net/2014/06/06/statistical-features-of-infinite-jest/ tokens = nltk.word_tokenize(text) stemmer = nltk.stem.PorterStemmer() stemmed_tokens = map(lambda x: stemmer.stem(x), tokens) for token in stemmed_tokens: if token in stems: newVal = stems.get(token) + 1 stems.update({token: newVal})
def checkText(app):
    """Scan the decompiled layout XML of *app* for threatening UI text.

    Trains a Naive Bayes classifier on hand-labelled ransomware phrases,
    extracts every text="..." attribute value from the app's layout files
    under F:\\Apktool\\<app>\\res\\layout\\, classifies each extracted
    string, and returns 1 if any string is labelled 'threat', else 0.
    """
    train = [
        ('Find a retail location near you.', 'moneypak'),
        ('Look for a MoneyPak in the prepaid section.', 'moneypak'),
        # BUG FIX: label was ' moneypak' (leading space), which silently
        # created a spurious fourth class in the classifier.
        ('Take it to the cashier and load it with cash.', 'moneypak'),
        ('To pay fine you should enter the digits MoneyPak resulting pass in the payment form and press Pay MoneyPak.', 'moneypak'),
        ('Amount of fine is 500 dollars.', 'moneypak'),
        ('After submitting the MoneyPak voucher your fine will be redeemed in hours.', 'moneypak'),
        ('You can settle the fine with MoneyPak vouchers.', 'moneypak'),
        ('As soon as the money arrives to the Treasury account your Mobile Device will be unlocked and all information will be decrypted in course of hours.', 'moneypak'),
        ('Fine should be paid not later than in 12 hours since current time.', 'moneypak'),
        ('To unlock your device and to avoid other legal consequences you are obligated to pay a release fee of 300 dollars', 'moneypak'),
        ('Payable through GreenDot MoneyPak.', 'moneypak'),
        ('load it with 300 dollars and enter the code.', 'moneypak'),
        ('The penalty as a base measure of punishment on you which you are obliged to pay in a current of is imposed.', 'moneypak'),
        ('Click to win lottery worth ten millions.', 'moneypak'),
        ('This device lock is aimed to stop your illegal activity.', 'threat'),
        ('Your Device has been blocked up for safety reasons listed below.', 'threat'),
        ('All the actions performed on this device are fixed.', 'threat'),
        ('You have been blocked to use your device. you need to pay a fee of rupees 500 to unblock.', 'threat'),
        ('In less than 3 days your device data will be sent to our servers and you will be blocked to use your data. To abort this you should pay a ransom of 500 rupees.', 'threat'),
        ('Please mind that both your personal identities and location are well identified and criminal case can be opened against you in course of hours as of commission of crimes per above Articles.', 'threat'),
        ('The penalty set must be paid in course of hours as of the breach.', 'threat'),
        ('On expiration of the term hours that follow will be used for automatic collection of data on yourself and your misconduct and criminal case will be opened against you.', 'threat'),
        ('This device is locked due to the violation of the federal laws of the United States of America', 'threat'),
        ('The messages with terroristic motives attempts in respect of political persons and also in respect of top public officials were sent from your device', 'threat'),
        ('According to these data you will be permitted to pay fine in State Treasury in the consequence of initiative directed on protection of cyber space in U.S.A and in doing so to seize your clerical correspondence and taking your criminal case to court for decision formulation.', 'threat'),
        ('In case of penalty non-redemption your case will be taken to court for the further decision formulation and determination of your criminal case.', 'threat'),
        ('Seize clerical correspondence taking criminal case court decision formulation', 'threat'),
        ('Penalty non redemption case taken court decision formulation determination criminal', 'threat'),
        ('For this reason your device has been locked.', 'threat'),
        ("Information on your location and snaphots containing your face have been uploaded on the fbi cyber crime department's datacenter.", 'threat'),
        ('According to these positions your actions bear criminal character and you are a criminal subject.', 'threat'),
        ("If you don't adhere to the instructions provided you can be jailed under cyber crime law.", 'threat'),
        ("Send your phone details if you want to unlock your phone.", 'threat'),
        ('install', 'non-threat'),
        ('@string', 'non-threat'),
        ('The government policies have been changed', 'non-threat'),
        ('Under supervision of FBI.U.S.A. Ministry of Interior Interpol Copyright Alliance International Cyber Security Protection Alliance.', 'non-threat'),
        ('You are accused of committing the crime envisaged by Article 1 of United States of America criminal law.', 'non-threat'),
        ('Article 1 of United States of America criminal non-threat provides for the punishment of deprivation of liberty for terms from 5 to years.', 'non-threat'),
        ('Article Section Cause', 'non-threat'),
        ('The policies of government has been changed', 'non-threat'),
        ('you have been restricted by government agencies to continue', 'non-threat'),
        ('Article 1 Section 8 Cause 8 of the Criminal Code provides for a fine of two to five hundred minimal wages or a deprivation of liberty for two to eight years.', 'non-threat'),
        ('Thus violating article 2 of the Criminal Code of United States of America Article 2 of the Criminal Code provides for a deprivation of liberty for four to twelve years.', 'non-threat'),
        # NOTE(review): 'law' appears only once as a label — possibly meant
        # to be 'non-threat'; left unchanged pending confirmation.
        ('Spam distribution on e-mail addresses of private persons containing the advertisement of illegal drugs addictive substances or psychopharmaceuticals on the territory of the USA', 'law'),
        ('Disconnection or disposal of the device or your attempts to unlock the device independently will be apprehended as unapproved actions interfering the execution of the law of the united states of america (read section 1509 - obstruction of court orders and section 1510 - obstruction of criminal investigations.', 'non-threat'),
        ('In this case and in case of penalty non-payment in a current of three calendar days from the date of this notification. The total amount of penalty will be tripled and the respective fines will be charged to the outstanding penalty.', 'non-threat'),
        ('In case of dissent with the indicted prosecution you have the right to challenge it in court.', 'non-threat'),
        ('Distribution of false information about politicians.', 'non-threat'),
        ('You have been subjected to violation of Copyright and Related Rights Law (Video Music Software) and illegally using or distributing copyrighted contents', 'copyright'),
        ('Distribution of episodes containing abuse or p**n material with participation of animals', 'p**n'),
        ('Distribution and keeping of p**n material with participation of children', 'p**n'),
        ('You are accused of viewing and storage of forbidden child pornography and zoophilia.', 'p**n'),
        ('You have violated World Declaration on non-proliferation of child pornography.', 'p**n'),
        ('You have been viewing or distributing prohibited Pornographic content (Child Pornography photos and etc were found on your device).', 'p**n'),
        ('Video materials of p**n content including sexual acts with children (persons up to 9 years old).', 'p**n'),
        ('Video materials containing the scene of child (persons up to 9 years) and animals abuse.', 'p**n'),
        ('Video materials are made with the help of your device contain the scenes of cruel child and animals abuse.', 'p**n'),
        ('As a result of full scanning of your device some suspicious files have been found.', 'p**n'),
        ('Your attendance of the forbidden pornographic sites has been fixed.', 'p**n'),
        ('Depiction of animal cruelty.', 'p**n'),
        ('Whoever knowingly creates sells or possesses a depiction of animal cruelty with the intention of placing that depiction in interstate or foreign commerce for commercial gain shall be fined under this title or imprisoned not more than 5 years or both.', 'p**n'),
        ('Certain activities relating to material constituting or containing child pornography.', 'p**n'),
    ]
    c1 = NaiveBayesClassifier(train)
    path = "F:\\Apktool\\%s\\res\\layout\\" % app
    os.chdir(path)  # subsequent open() calls rely on cwd being the layout dir
    all_files = os.listdir(path)
    # Extract the text="..." attribute values from each layout file.
    extracted_texts = []
    for fname in all_files:
        # BUG FIX: the original opened every file but only closed the last
        # handle once after the loop (and raised NameError on an empty dir).
        with open(fname, "r") as handle:
            contents = handle.read()
        found = re.findall(r'text=\"(.*?)\"', contents, re.DOTALL)
        joined = "".join(found).replace('\n', ' ')
        if joined != '':
            extracted_texts.append(joined)
    # Classify each extracted string with the trained Naive Bayes model.
    predicted_labels = []
    for entry in extracted_texts:
        print("Text: " + entry)
        blob = TextBlob(entry, classifier=c1)
        predicted_labels.append(blob.classify())
    threat_count = sum(1 for label in predicted_labels if label == "threat")
    if threat_count >= 1:
        print("THREATENING TEXT PRESENT")
        c = 1
    else:
        print("Threatening Text Not Present")
        c = 0
    return c