import re

from textblob import TextBlob


def find_unknown_speakers(line, known_speakers):
    '''
    Tests whether a known speaker is in a line and prints the line if not,
    so we can see if there's a new, unknown speaker.
    '''
    speaker_is_known = False
    text = TextBlob(line)
    text = text.replace('\n', '')
    nws_text = text.replace(' ', '')  # whitespace-free copy for the empty-line check
    for test_speaker in known_speakers:
        if text.startswith(test_speaker):
            speaker_is_known = True
    if not speaker_is_known and nws_text != '':
        text = str(text)
        # Collapse bracketed stage directions so pure-direction lines can be skipped.
        print_text = re.sub(r"\[.*?\]", "[]", text)
        print_text = re.sub(r"\{.*?\}", "{}", print_text)
        if print_text != '[]' and print_text != '{}':
            print(print_text)  # ,'|||',text
from typing import Tuple

from textblob import TextBlob


def np_swap(aye: str, bee: str) -> Tuple[str, str]:
    """Swap every other noun phrase between the two texts."""
    a, b = TextBlob(aye), TextBlob(bee)
    a_nps, b_nps = a.noun_phrases, b.noun_phrases
    i = 0
    while i < len(a_nps) and i < len(b_nps):
        if i % 2 != 0:  # only odd-indexed noun phrases are swapped
            anp, bnp = a_nps[i], b_nps[i]
            a = a.replace(anp, bnp)
            b = b.replace(bnp, anp)
        i += 1
    return (a.raw, b.raw)
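# A minimal usage sketch for np_swap. The sentences are made-up inputs, and the
# textblob corpora needed for noun-phrase extraction are assumed to be installed
# (e.g. via `python -m textblob.download_corpora`). Lowercase inputs are used
# because noun_phrases returns lowercased phrases.
swapped_a, swapped_b = np_swap(
    "the quick fox admired the old bridge",
    "a tall woman crossed the busy street",
)
print(swapped_a)
print(swapped_b)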
def do_clean(line, known_speakers):
    speaker_is_known = False
    text = TextBlob(line)
    text = text.replace('\n', '')
    # nws_text = text.replace(' ', '')
    for test_speaker in known_speakers:
        if text.startswith(test_speaker):
            speaker_is_known = True
            speaker = test_speaker
            # remove_prefix is not defined in this snippet; see the sketch below.
            clean_line = remove_prefix(text, speaker)
            return speaker, clean_line
    # if speaker_is_known == False and nws_text != '':
    #     print(text)
    return '-', '-'
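# do_clean calls a remove_prefix helper that is not shown. A minimal sketch of
# what it presumably does (strip a known speaker tag off the front of a line);
# the str() coercion and the lstrip are assumptions, not the original code:
def remove_prefix(text, prefix):
    text = str(text)
    if text.startswith(prefix):
        return text[len(prefix):].lstrip()
    return text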
def calcSentiment(input):
    '''Calculate the sentiment of every tweet in a Twitter file and average it.'''
    # Part of this code is based on the master's thesis of Guangxue Cao.
    tweets_data = []
    sentiment_array = []
    total = 0
    OneTweetTime = ""
    average_sentiment = 0
    # Load the input (an iterable of already-parsed tweet objects).
    for line in input:
        tweets_data.append(line)
    # Iterate over all tweets.
    for tweet_data in tweets_data:
        tweet = tweet_data["text"]
        # Analyze the tweet with TextBlob to obtain its sentiment.
        tweet = TextBlob(tweet)
        OneTweetTime = tweet_data["created_at"]
        # Remove line breaks.
        tweet = tweet.replace("\n", " ")
        tweet = tweet.replace("\r ", " ")
        sentiment = tweet.sentiment.polarity
        sentiment_array.append(sentiment)
    for sentiment in sentiment_array:
        total += sentiment
    if len(sentiment_array) != 0:
        average_sentiment = total / len(sentiment_array)
    return [OneTweetTime, average_sentiment]
    # writer.writerow([OneTweetTime, "sentiment:", average_sentiment])
    # tweet = TextBlob(tweet, analyzer=NaiveBayesAnalyzer())
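# A minimal usage sketch for calcSentiment; it expects already-parsed tweet
# dicts carrying "text" and "created_at" keys (the values below are made up).
sample_tweets = [
    {"text": "I love this!", "created_at": "Mon Jan 01 12:00:00 +0000 2024"},
    {"text": "This is terrible.", "created_at": "Mon Jan 01 12:05:00 +0000 2024"},
]
last_time, avg_polarity = calcSentiment(sample_tweets)
print(last_time, avg_polarity)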
from collections import Counter


def get_category(txt):
    category_counte = Counter()
    a = []
    blob = TextBlob(str(txt))
    blob = blob.replace("-", " ")
    for item in list(blob.noun_phrases):
        bob = TextBlob(item)
        # Score each noun phrase; sigmoid is not defined in this snippet
        # (see the sketch below).
        category_counte[item] = sigmoid(
            (float(bob.polarity)) / ((float(bob.subjectivity)) + 1))
    if len(list(category_counte.most_common())) > 0:
        a = list((category_counte.most_common())[0])
        return str(a[0])
    else:
        return "others"
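# get_category relies on a sigmoid helper that is not shown; a minimal sketch
# of the standard logistic function it presumably refers to:
import math


def sigmoid(x):
    return 1 / (1 + math.exp(-x))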
def pos_swap(aye: str, bee: str, pos: str) -> Tuple[int, str, str]:
    """Swap every word with the given part-of-speech tag between the two texts."""
    # Uses TextBlob and Tuple as imported for np_swap above.
    # TODO: Replace instances of subword in own definition.
    swaps = 0
    a, b = TextBlob(aye), TextBlob(bee)
    apos, bpos = a.pos_tags, b.pos_tags
    aps = [p for p in apos if p[1] == pos]
    bps = [p for p in bpos if p[1] == pos]
    for i in range(min([len(aps), len(bps)])):
        ap = aps[i][0]
        bp = bps[i][0]
        a = a.replace(ap, bp)
        b = b.replace(bp, ap)
        swaps += 1
    return (swaps, a.raw, b.raw)
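# A minimal usage sketch for pos_swap; 'NN' (singular noun) is just an example
# Penn Treebank tag, and the sentences are made-up inputs.
n_swaps, out_a, out_b = pos_swap(
    "the cat sat on the mat",
    "a dog slept in the yard",
    "NN",
)
print(n_swaps, out_a, out_b)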
def inputNumber(message):
    while True:
        try:
            userInput = int(input(message))
        except ValueError:
            print("Invalid input. Please enter a number: 1, 2, 3, or 4.")
            continue
        if userInput not in [1, 2, 3, 4]:
            print("Invalid integer. Please enter 1, 2, 3, or 4.")
            continue

        ##############################################################################################################
        #######--------CHOICE-#1:-DOCUMENT-FILE----------------------------------------------------------##############
        ##############################################################################################################
        if userInput == 1:
            docchoice = input("Please enter the name of the Text File.\n")
            sourcedoc = open(docchoice, 'r')
            readsource = sourcedoc.read()
            lowfile = readsource.lower()
            # filesoup = BeautifulSoup(lowfile, 'lxml')
            # filetext = filesoup.get_text(strip=True)
            # sent = TextBlob(filetext)
            sent = TextBlob(lowfile)
            # Replace slashes and dashes with spaces before stripping punctuation.
            slashsplice = sent.replace('/', ' ')
            dashsplice = slashsplice.replace('-', ' ')
            dashsplice2 = dashsplice.replace('–', ' ')
            sentblob = TextBlob(lowfile)
            filepunct = TextBlob(str(remove_punctuation(dashsplice2)))
            finaltext = str(remove_punctuation(dashsplice2))

            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(" Polarity(Emotion): \n [ -1:Negative, 0:Neutral, 1:Positive ]")
            print("\n Subjectivity(Fact VS Opinion): \n [ 0:Objective 1:Subjective ]")
            print("------------------------------------------------")
            polar = sentblob.sentiment.polarity
            subject = sentblob.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("| Polarity: ", polar, " \n| Subjectivity: ", subject, " ")
            print("|------------------------------------|")

            # Lemmatize using WordNet POS tags, then strip punctuation from tokens.
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in filepunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            # splitpunct = filepunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '2', '1', '0',
                '–', '’', '’', '“', '”'
            ]
            # tokens = [w for w in splitpunct]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n-------30 MOST COMMON WORDS-------: \n")
            for key, value in count.most_common(30):
                print(" " + str(value) + " - " + key)
            print("\n-------FREQUENCY CHART-------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(15, cumulative=False)

            ##---------------PHRASE (2,3,4 WORDS) COUNTER----------------------------------------
            bitokens = nltk.word_tokenize(finaltext)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print(" ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print(" ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print(" ", key, "", key2, "", key3, "", key4, "-", value)

            ####---------------------READABILITY INDEX--------------------###########
            flesh = int(textstat.flesch_reading_ease(readsource))
            print("--------FLESCH-KINCAID TEST--------\n",
                  "\n Readability Score: ", flesh)
            if flesh in range(0, 31):
                print(" Very difficult to read. Best understood by university graduates.")
            if flesh in range(31, 51):
                print(" Difficult to read.")
            if flesh in range(51, 61):
                print(" Fairly difficult to read.")
            if flesh in range(61, 71):
                print(" Plain English. Easily understood by 13- to 15-year-old students.")
            if flesh in range(71, 81):
                print(" Fairly easy to read.")
            if flesh in range(81, 91):
                print(" Easy to read.")
            if flesh in range(91, 101):
                print(" Very easy to read. Easily understood by an average 11-year-old student.")
            print("-----------------------------------\n")

            ##################---END LOOP---##########################################################################################################
            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]\n")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                quit()
            while again not in acceptable:
                print("\nSorry, didn't catch that. Please select an option below:")
                return inputNumber(message)
            break

        ##############################################################################################################
        ####----------CHOICE-#2:-URL/LINK-------------------------------------------------------------------------------
        ##############################################################################################################
        if userInput == 2:
            webchoice = input("Please enter the URL of the website.\n")
            webdoc = urllib.request.urlopen(webchoice)
            readweb = webdoc.read().decode('utf-8')  # decode bytes before stripping tags
            websoup = w3lib.html.remove_tags(readweb)
            # websoup = BeautifulSoup(readweb, 'html5lib')
            # websoup2 = websoup.text
            print(websoup)
            lowweb = websoup.lower()
            websent = TextBlob(lowweb)
            slashsplice = websent.replace('/', ' ')
            dashsplice = slashsplice.replace('-', ' ')
            dashsplice2 = dashsplice.replace('–', ' ')
            dashsplice3 = dashsplice2.replace(' – ', ' ')
            pagesplice = dashsplice3.replace(' p. ', ' ')
            pagesplice2 = pagesplice.replace(' pp.', ' ')
            webpunct = TextBlob(str(remove_punctuation(pagesplice2)))
            finalweb = str(remove_punctuation(pagesplice2))

            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(" Polarity(Emotion): \n [ -1:Negative, 0:Neutral, 1:Positive ]")
            print("\n Subjectivity(Fact VS Opinion): \n [ 0:Objective 1:Subjective ]")
            print("------------------------------------------------")
            polar = websent.sentiment.polarity
            subject = websent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("| Polarity: ", polar, " \n| Subjectivity: ", subject, " ")
            print("|------------------------------------|")

            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in webpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', " ",
                'mwparseroutput', 'wwww3org', 'xmlnshttp', 'also', '1', '0',
                'svg', '2', 'jw', '’', '“', '”', 'u'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n---------MOST COMMON WORDS---------: \n")
            for key, value in count.most_common(30):
                print(" " + key + " - " + str(value))
            print("\n---------FREQUENCY CHART---------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)

            #################################################################################################
            ##---------------PHRASE (2,3,4 WORDS) COUNTER----------------------------------------
            ###################################################################################
            bitokens = nltk.word_tokenize(finalweb)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(20)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(20)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(20)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print(" ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print(" ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print(" ", key, "", key2, "", key3, "", key4, "-", value)

            #################################################################################################
            ##---------------READABILITY INDEX----------------------------------------
            ###################################################################################

            ##########---------------END LOOP---------------------##############################
            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            while again not in acceptable:
                print("\nSorry, didn't catch that. Please select an option below:")
                return inputNumber(message)
            break

        ########################################################################################################################
        ############--------CHOICE-#3:-MANUAL-INPUT----------########################################
        ############################################################################################################
        if userInput == 3:
            manchoice = input("Please enter your text here:\n")
            lowman = manchoice.lower()
            mansoup = BeautifulSoup(lowman, 'html5lib')
            mantext = mansoup.get_text(strip=True)
            mansent = TextBlob(mantext)
            sent = TextBlob(manchoice)
            manpunct = TextBlob(str(remove_punctuation(mansent)))
            finalman = str(remove_punctuation(mansent))
            splitpunct = manpunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '0', '–', '’',
                '“', '”', '’'
            ]

            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(" Polarity(Emotion): \n [ -1:Negative, 0:Neutral, 1:Positive ]")
            print("\n Subjectivity(Fact VS Opinion): \n [ 0:Objective 1:Subjective ]")
            print("------------------------------------------------")
            polar = sent.sentiment.polarity
            subject = sent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("| Polarity: ", polar, " \n| Subjectivity: ", subject, " ")
            print("|------------------------------------|")

            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in manpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            # tokens = [w.translate(punctuate) for w in lemmatized_list]
            tokens = [w for w in splitpunct]
            # Note: this overwrites the stoplist defined above.
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '—'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n------35 MOST COMMON WORDS------: \n")
            for key, value in count.most_common(35):
                print(" " + key + " - " + str(value))
            print("\n------FREQUENCY CHART------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)

            #################################################################################################
            ##---------------PHRASE (2,3,4 WORDS) COUNTER----------------------------------------
            ##################################################################################
            bitokens = nltk.word_tokenize(finalman)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print(" ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print(" ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print(" ", key, "", key2, "", key3, "", key4, "-", value)

            ######---------------READABILITY INDEX---------------####
            flesh = int(textstat.flesch_reading_ease(manchoice))
            print("\n----------FLESCH-KINCAID TEST----------:\n",
                  "\n Readability Score: ", flesh, "\n")
            if flesh in range(0, 31):
                print(" --Very difficult to read. Best understood by university graduates.--")
            if flesh in range(31, 51):
                print(" --Difficult to read.--")
            if flesh in range(51, 61):
                print(" --Fairly difficult to read.--")
            if flesh in range(61, 71):
                print(" --Plain English. Easily understood by 13- to 15-year-old students.--")
            if flesh in range(71, 81):
                print(" --Fairly easy to read.--")
            if flesh in range(81, 91):
                print(" --Easy to read.--")
            if flesh in range(91, 101):
                print(" --Very easy to read. Easily understood by an average 11-year-old student.--")
            print("\n------------------------------------------\n")

            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            while again not in acceptable:
                print("\nSorry, didn't catch that. Please select an option below:")
                return inputNumber(message)
            break

        ###################################################################################################################
        ##########---------CHOICE 4: QUIT PROGRAM-------------------------------------------------------------------------------
        ######################################################################################################################
        if userInput == 4:
            print("Thank you for using BTL 0.6. Bye!")
            quit()
        break
# Training the classifier on the body dataset
with open("dataset2.json", 'r', encoding="utf-8-sig") as fp2:
    cl2 = NaiveBayesClassifier(fp2, format="json")

# Taking the string values
str1 = str(headline)
headline = TextBlob(str1)
body = str(body)
tb_body = TextBlob(body)

# Finding the subjectivity
subjectivity = tb_body.sentiment.subjectivity
subjectivity = float(subjectivity) * 100
body_classify = str(cl2.classify(body))
body = body.lower()

# Stripping forms of 'was' and stray apostrophes from the headline
headline = headline.replace('Was', '')
headline = headline.replace('was', '')
headline = headline.replace('’', '')

# Finding the tags in the sentence
array = headline.tags
array1 = []

# Finding the hot words (nouns)
for ii in array:
    name, tag = ii
    name = str(name)
    name = name.lower()
    if tag.count('NN') > 0:
        name = TextBlob(name)
        array1.append(name)
# Pickling the trained classifier
pickle.dump(cl2, f2)
f2.close()
print("Pickle created")

# Taking the string values
str1 = str(headlines)
headline = TextBlob(str1)
body = str(articles)
tb_body = TextBlob(body)

# Finding the subjectivity
subjectivity = tb_body.sentiment.subjectivity
subjectivity = float(subjectivity) * 100
body_classify = str(cl2.classify(body))
body = body.lower()

# Stripping forms of 'was' and stray apostrophes from the headline
headline = headline.replace('Was', '')
headline = headline.replace('was', '')
headline = headline.replace('’', '')

# Finding the tags in the sentence
array = headline.tags
array1 = []

# Finding the hot words (nouns)
for ii in array:
    name, tag = ii
    name = str(name)
    name = name.lower()
    if tag.count('NN') > 0:
        name = TextBlob(name)
        array1.append(name)
import os

import pandas as pd
from textblob import TextBlob

data = pd.read_csv('../../gen/data-preparation/temp/parsed-data.csv',
                   sep='\t', encoding='utf-8')
data.head()

for i, j in data.iterrows():
    print(i)
    try:
        blob = TextBlob(j['text'])
        # Replace punctuation with spaces before scoring.
        blob = blob.replace('.', ' ').replace(',', ' ').replace('#', ' ') \
                   .replace('!', ' ').replace('?', ' ').replace(':', ' ') \
                   .replace(';', ' ').replace('&', ' ').replace('/', ' ')
        data.loc[i, 'polarity'] = blob.sentiment.polarity
        data.loc[i, 'subjectivity'] = blob.sentiment.subjectivity
    except Exception:
        data.loc[i, 'polarity'] = ''
        data.loc[i, 'subjectivity'] = ''

data.head()
os.makedirs('../../gen/data-preparation/output/', exist_ok=True)
data.to_csv('../../gen/data-preparation/output/dataset.csv', index=False)
print('done.')
            # (Fragment: these first lines continue a loop that starts above this excerpt.)
            continue
        else:
            wordList = wordList + word + " "

negString = ''
posString = ''
neutString = ''
for word in bigString.split():
    tb = TextBlob(word)
    # Skip links, mentions, retweet markers, and dotted tokens.
    if ('https') in tb or ('@') in tb or ("re") in tb or ("rt") in tb or (".") in tb:
        continue
    elif ('#') in tb:
        tb = tb.replace("#", "")
        if tb.polarity > 0:
            str(tb)
            posString = posString + word + " "
        elif tb.polarity < 0:
            str(tb)
            negString = negString + word + " "
        else:
            str(tb)
            neutString = neutString + word + " "
    elif len(tb) <= 3:
        continue
    else:
        tb = tb.replace("#", "")
        if tb.polarity > 0:
            str(tb)
auth.set_access_token(config['access_token'], config['access_token_secret'])
api = tweepy.API(auth)

# Load text file
filename = open("./txt/marx2.txt", 'r')
text = filename.readlines()
text = ' '.join(text)
filename.close()

blob = TextBlob(text)  # the file is opened in text mode, so no .decode() is needed
tags = blob.tags
for blobs in blob.tags:
    if blobs[1] == 'NNP':
        # Turn every proper noun into a hashtag.
        wordchange = '#' + blobs[0]
        blob = blob.replace(blobs[0], wordchange)
        print("changing: " + wordchange)

for sentence in blob.sentences:
    sentence = re.sub(r'#+', '#', str(sentence))  # collapse repeated hashes
    print(sentence)
    print("--")
    try:
        print("next tweet: " + str(sentence))
        api.update_status(sentence)
        time.sleep(120)  # tweet every 2 minutes
    except Exception:
        continue

# blob.translate(to="es")  # 'La amenaza titular de The Blob...'
# if conn is not None:
#     conn.execute(sql_create_projects_table)
#     print("table should be created")
# else:
#     print("Error! cannot create the database connection.")

## Basic Functionality -- Inputs Text, Generates Output
st.subheader(
    """
    Basic Text Sentiment Functionality - text from any Social Media Platform
    """
)
opinionInput = st.text_input('Type in some text')
if st.button('Generate Sentiment!') and opinionInput != "":
    opinion = TextBlob(opinionInput, analyzer=NaiveBayesAnalyzer())
    st.write(opinion.sentiment)
    # Write insert statement here
    test = opinion.replace(" ", "") + " " + opinion.sentiment[0]
    strippedOpinion = str(opinion).replace(" ", "")
    resultingSentiment = str(opinion.sentiment[0])
    # qry = ("INSERT INTO entries(entry, sentiment) VALUES('" +
    #        strippedOpinion + "', '" + resultingSentiment + "');")
    # conn.execute(qry)
    # conn.commit()
else:
    st.write('Provide an Input!')

st.subheader(
    """
    Twitter Sentiment Functionality - queries text from Twitter Social Media Platform
    """
)
twitterOpinionInput = st.text_input('Enter text')
# Note: the original checked opinionInput here; the Twitter input is what matters.
if st.button('Generate Sentiment') and twitterOpinionInput != "":
    stro = ""
    for tweets in api.search(q=twitterOpinionInput, lang="en"):
data = pd.read_csv('../../gen/data-preparation/temp/parsed-data.csv', sep='\t')
data.head()

DetectorFactory.seed = 0
analyser = SentimentIntensityAnalyzer()
good_words = ['spectacular', 'good', 'great', 'best', 'goat', 'incredible',
              'amazing', 'crazy', 'insane', 'fire']
delete_words = ['$', '%', '=', '»', '«', '@', ' ', '£', '§', '€', '*']

for i, j in data.iterrows():
    print(i)
    time = 0
    date = str(j['created_at'])
    # Note: .lower() must apply to the tweet text, not to the column name.
    blob = TextBlob(str(j['text']).lower())
    for d in delete_words:
        blob = blob.replace(d, '')
    if 'RT' in str(j['text']):
        data.loc[i, 'retweet'] = True
    else:
        data.loc[i, 'retweet'] = False
    try:
        date = date.split(' ')
        hour = date[3].split(':')
        time += float(hour[0]) + float(hour[1]) / 60
        data.loc[i, 'hour'] = time
        data.loc[i, 'language'] = detect(str(j['text']))
        data.loc[i, 'polarity'] = blob.sentiment.polarity
        data.loc[i, 'subjectivity'] = blob.sentiment.subjectivity
import os

from textblob import TextBlob
from textblob.en import Spelling

path = "spelling-model-weighted.txt"
assert os.path.isfile(path)
spelling = Spelling(path=path)

MOCKDATA = "hi i dont spel"
test = TextBlob(MOCKDATA)

# Expand common contractions that are missing their apostrophes.
test1 = test.replace('dont', "don't")
test1 = test1.replace('doesnt', "doesn't")
test1 = test1.replace('didnt', "didn't")
test1 = test1.replace('wont', "won't")
test1 = test1.replace('wouldve', "would've")
test1 = test1.replace('cant', "can't")
test1 = test1.replace('couldnt', "couldn't")
test1 = test1.replace('couldve', "could've")
test1 = test1.replace('shouldnt', "shouldn't")
test1 = test1.replace('shouldve', "should've")
test1 = test1.replace('mightve', "might've")
test1 = test1.replace('havent', "haven't")
test1 = test1.replace('lets', "let's")
print(test1)

for word in test1.words:
    print(spelling.suggest(word))
    # (Fragment: these lines run inside a try block that begins above this excerpt.)
    word = translator.translate(word, dest=detect).text
    print("recognised " + word)
    word = translator.translate(word, dest='en').text  # translate to English for the search
    print("\nSearching...", word)

    # Working with browser
    driver = webdriver.Chrome()
    if s_engine in ['Google', 'google']:
        driver.get('https://google.com')
        searchbox = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input')
        searchbox.send_keys(word)
        searchButton = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[3]/center/input[1]')
        searchButton.click()
    elif s_engine in ['YouTube', 'Youtube', 'youtube']:
        # Build a URL-safe query (the original replaced the spaces but then
        # discarded the result by concatenating the raw word).
        if " " in word:
            word = word.replace(" ", "+")
        searchItem = "https://www.youtube.com/results?search_query=" + word
        driver.get(searchItem)
    elif s_engine in ['Wikipedia', 'Wiki', 'wikipedia', 'wiki']:
        driver.get('https://en.wikipedia.org/wiki/Main_Page')
        searchbox = driver.find_element_by_xpath('//*[@id="searchInput"]')
        searchbox.send_keys(word)
        searchButton = driver.find_element_by_xpath('//*[@id="searchButton"]')
        searchButton.click()
    else:
        print("We couldn't recognize the search engine")
    print("\nExecution successful.")
except Exception as e:
    print(e)
    print("Service time-out !")
# ***
# ### 9.0 Successful and unsuccessful projects histogram (by count) per category

# In[41]:

# Loop over all the keywords in all the data.keywords and increment the counts
# in the appropriate counter objects
from textblob import TextBlob

blob = TextBlob(data.keywords[6])

# In[42]:

data.keywords[6]

# In[43]:

blob = blob.replace("-", " ")

# In[44]:

blob.noun_phrases

# In[45]:

blob.sentiment

# In[46]:

def get_category(txt):
    category_counte = Counter()
# Append the json file
with open(input_file) as input_novartis:
    for line in input_novartis:
        tweets_novartis.append(json.loads(line))


# Unused helper carried over from a class context (note: time.strptime, not striptime).
def set_date(self, lan):
    date = time.strptime(lan, '%b %d %Y ')
    self.date = datetime.fromtimestamp(time.mktime(date))


# Parsing the text and date data
with open(output_file, 'w', newline='') as output_novartis:
    writer = csv.writer(output_novartis)
    # Loop variable renamed so it no longer shadows the list being iterated.
    for tweet_novartis in tweets_novartis:
        tweet = tweet_novartis["full_text"]
        lan = tweet_novartis["created_at"]

        # Sentiment analysis
        tweet = TextBlob(tweet)
        tweet = tweet.replace("\n", " ")
        tweet = tweet.replace("\r", " ")

        # Sentiment score
        sentiment = [tweet.sentiment.polarity]
        writer.writerows(zip(sentiment, [lan[4:10], lan[26:]]))
    # counter += 1
    # if counter > 10: break
    if len(text) > 1:
        text = re.sub(r'@\w*', ' ', text)                  # strip @mentions
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)   # strip URLs
        text = re.sub(r'[\n]*', '', text)                  # strip newlines
        text = text.strip()  # strip() returns a new string, so reassign it
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        text_blob = TextBlob(text)
        # Removing stop words
        text_blob = TextBlob(text).lower()
        for d in stopwords.words('english'):
            text_blob = text_blob.replace(d.lower() + ' ', ' ')
        # Cleaning to remove extra spaces
        text_blob = text_blob.replace('  ', ' ')
        # Correcting spelling
        text_blob = text_blob.correct()
        # Lemmatization
        text_blob = Word(text_blob).lemmatize()
    else:
        text_blob = TextBlob(text).lower()

    # Sentiment analysis
    score_vader = analyser.polarity_scores(text_blob)
def genQuestion(sentence, ner):
    """Outputs questions generated from the given text."""
    # print("ner: ", ner)
    time_flag = 0
    word_ner_map = {}
    for i in range(len(ner)):
        word_ner_map[ner[i][0]] = ner[i][1]
        if ner[i][1] == "TIME" or ner[i][1] == "DATE":
            time_flag = 1
    # print(word_ner_map)

    if type(sentence) is str:
        line = TextBlob(sentence)

    bucket = {}  # Maps each POS tag to the index of its first occurrence.
    for i, j in enumerate(line.tags):
        # print(j)
        if j[1] not in bucket:
            bucket[j[1]] = i
    # print(bucket)

    question = ''
    l1 = ['NNP', 'VBG', 'VBZ', 'IN']
    l2 = ['NNP', 'VBG', 'VBZ']
    l3 = ['PRP', 'VBG', 'VBZ', 'IN']
    l4 = ['PRP', 'VBG', 'VBZ']
    l5 = ['PRP', 'VBG', 'VBD']
    l6 = ['NNP', 'VBG', 'VBD']
    l7 = ['NN', 'VBG', 'VBZ']
    l8 = ['NNP', 'VBZ', 'JJ']
    l9 = ['NNP', 'VBZ', 'NN']
    l10 = ['NNP', 'VBZ']
    l11 = ['PRP', 'VBZ']
    l12 = ['NNP', 'NN', 'IN']
    l13 = ['NN', 'VBZ']
    # WHO question generation rules
    l14 = ['NNP', 'VBD', 'NN']
    l15 = ['NNP', 'VBZ', 'NN']
    l16 = ['NNP', 'VB', 'NN']

    questions = []

    # With conditional statements, the bucket dictionary is compared with the
    # tag lists created above.

    # Questions starting with WHO
    if all((key in bucket for key in l15)) and (word_ner_map.get(
            line.words[bucket['NNP']], "") == "PERSON"):
        question = line.replace(line.words[bucket['NNP']], "Who") + "?"
        questions.append(question)
    elif all((key in bucket for key in l16)) and (word_ner_map.get(
            line.words[bucket['NNP']], "") == "PERSON"):
        question = line.replace(line.words[bucket['NNP']], "Who") + "?"
        questions.append(question)
    elif all((key in bucket for key in l14)) and (word_ner_map.get(
            line.words[bucket['NNP']], "") == "PERSON"):
        question = line.replace(line.words[bucket['NNP']], "Who") + "?"
        questions.append(question)

    # Questions starting with WHEN
    if all((key in bucket for key in l15)) and time_flag:
        start_index = bucket['VBZ']
        end_index = bucket['IN']
        question = "When " + line.words[bucket['VBZ']]
        for i in range(end_index):
            if i != start_index:
                question += (" " + line.words[i])
        question += (" " + "?")
        questions.append(question)
    elif all((key in bucket for key in l14)) and time_flag:
        start_index = bucket['VBD']
        end_index = bucket['IN']
        question = "When " + line.words[bucket['VBD']]
        for i in range(end_index):
            if i != start_index:
                question += (" " + line.words[i])
        question += (" " + "?")
        questions.append(question)
    elif all((key in bucket for key in l16)) and time_flag:
        start_index = bucket['VB']
        end_index = bucket['IN']
        question = "When " + line.words[bucket['VB']]
        for i in range(end_index):
            if i != start_index:
                question += (" " + line.words[i])
        question += (" " + "?")
        questions.append(question)

    # Questions starting with WHAT
    if all(key in bucket for key in l1):  # 'NNP', 'VBG', 'VBZ', 'IN' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l2):  # 'NNP', 'VBG', 'VBZ' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l3):  # 'PRP', 'VBG', 'VBZ', 'IN' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['PRP']] + ' ' + line.words[bucket['VBG']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l4):  # 'PRP', 'VBG', 'VBZ' in sentence.
        # The original doubled the VBG index here; following the l3 pattern instead.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['PRP']] + ' ' + line.words[bucket['VBG']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l7):  # 'NN', 'VBG', 'VBZ' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['NN']] + ' ' + line.words[bucket['VBG']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l8):  # 'NNP', 'VBZ', 'JJ' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['NNP']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l9):  # 'NNP', 'VBZ', 'NN' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['NNP']] + '?')
        questions.append(question)
    elif all(key in bucket for key in l11):  # 'PRP', 'VBZ' in sentence.
        if line.words[bucket['PRP']] in ['she', 'he']:
            question = ('What' + ' does ' + line.words[bucket['PRP']].lower() +
                        ' ' + line.words[bucket['VBZ']].singularize() + '?')
            questions.append(question)
    elif all(key in bucket for key in l10):  # 'NNP', 'VBZ' in sentence.
        question = ('What' + ' does ' + line.words[bucket['NNP']] + ' ' +
                    line.words[bucket['VBZ']].singularize() + '?')
        questions.append(question)
    elif all(key in bucket for key in l13):  # 'NN', 'VBZ' in sentence.
        question = ('What' + ' ' + line.words[bucket['VBZ']] + ' ' +
                    line.words[bucket['NN']] + '?')
        questions.append(question)

    # When the tags are generated, 's is split into ' and s; undo that here.
    if 'VBZ' in bucket and line.words[bucket['VBZ']] == "’":
        question = question.replace(" ’ ", "'s ")
        questions.append(question)

    # Questions starting with WHY (guard 'VBZ' to avoid a KeyError on bucket).
    if "because" in sentence.lower() and 'VBZ' in bucket:
        question = 'Why ' + line.words[bucket['VBZ']]
        end_index = sentence.split().index("because")
        for i in range(end_index):
            if i != bucket['VBZ']:
                question += (" " + line.words[i])
        question += (" " + "?")
        questions.append(question)

    return questions
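# A minimal usage sketch for genQuestion; the sentence and the (word, entity)
# NER pairs are made-up inputs in the format the function expects.
sample_sentence = "Alice writes code"
sample_ner = [("Alice", "PERSON")]
print(genQuestion(sample_sentence, sample_ner))
# Expected to include a WHO question such as "Who writes code?"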