def preprocessing(tweet, sentiment):
    """Clean one tweet and append it, with its label, to the output files.

    The tweet is stripped of links, ``&``-entities, @-mentions, hashtags and
    ``$`` tickers, lower-cased, has common negation contractions expanded,
    is reduced to ASCII letters, spell-corrected with TextBlob and finally
    whitespace-normalised.  When at least one word survives, ``sentiment``
    is written to the module-level ``fp2`` handle and the cleaned tweet to
    ``fp1``, one entry per line.

    Args:
        tweet: Raw tweet text; coerced with ``str()``.
        sentiment: Label string written next to the tweet.
    """
    negations_dic = {
        "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not",
        "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "can't": "can not", "couldn't": "could not",
        "shouldn't": "should not", "mightn't": "might not",
        "mustn't": "must not",
    }
    negation_re = re.compile(r'\b(' + '|'.join(negations_dic) + r')\b')

    text = str(tweet)
    # Links go first and are matched token-wise.  The previous greedy
    # pattern (``https?://.*/\w*``) deleted everything between the first and
    # the last link of a tweet and missed links without a path slash.
    text = re.sub(r'https?://\S*', '', text)   # remove hyperlinks
    text = re.sub(r'www\.\S+', '', text)       # remove bare www. urls
    text = re.sub(r'&\w*', '', text)           # remove & entities
    text = re.sub(r'@\S+', '', text)           # remove @mentions
    text = re.sub(r'#\w*', '', text)           # remove hashtags
    text = re.sub(r'\$\w*', '', text)          # remove tickers
    text = text.strip(' ').lower()

    # Expand contractions before punctuation is discarded, otherwise the
    # apostrophes needed to recognise them are gone.
    text = negation_re.sub(lambda m: negations_dic[m.group()], text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)     # keep letters only
    text = str(TextBlob(text).correct())
    text = ' '.join(text.split())              # collapse/trim whitespace

    if text:
        fp2.write(sentiment + '\n')
        fp1.write(text + '\n')
def _bed_count(lines, label):
    """Return the integer found two lines above ``label`` in ``lines``."""
    cell = lines[lines.index(label) - 2]
    return int(cell.replace('*', '').replace(' ', '').replace('_', ''))


def bed_availability(beds):
    """Scrape hospital bed vacancy figures from the page at ``beds``.

    The page is downloaded, converted to text with ``html2text`` and split
    on ``"####"``.  Every section starting with ``" **"`` is treated as one
    hospital record.

    Args:
        beds: URL of the page to fetch.

    Returns:
        A list of ``(hospital_name, contact, vacant, icu_vacant,
        non_icu_vacant)`` tuples.  ``contact`` is the first 10-digit number
        in the record, or ``None`` when there is none; the three counts are
        ints.

    Raises:
        ValueError: If a record lacks one of the ``Vacant`` / ``ICU Vacant``
            / ``Non ICU Vacant`` lines or its count is not an integer.
    """
    data = html2text.html2text(requests.get(beds).text)
    time.sleep(2)
    blob = TextBlob(data)
    sections = blob.split("####")

    extract_out = []
    for hospital in sections:
        if not hospital.startswith(" **"):
            continue
        # Split once; the original re-split the record for every field.
        lines = hospital.split("\n")
        phone_numbers = re.findall("[0-9]{10}", hospital)
        contact = phone_numbers[0] if phone_numbers else None
        hospital_name = lines[0].replace('*', '')
        extract_out.append((
            hospital_name,
            contact,
            _bed_count(lines, 'Vacant'),
            _bed_count(lines, 'ICU Vacant'),
            _bed_count(lines, 'Non ICU Vacant'),
        ))
    return extract_out
def update(table, field):
    """Annotate every item of a DynamoDB table with text statistics.

    For each scanned item the text stored under ``field`` is lower-cased
    and three attributes are written back and saved: ``subjectivity`` and
    ``polarity`` (TextBlob sentiment) and ``lexical diversity`` (unique
    words divided by total words, rounded to two decimals; ``0`` for empty
    text).

    Args:
        table: Name of the DynamoDB table in ``us-west-2``.
        field: Name of the item attribute holding the text.
    """
    # NOTE(review): credentials are passed as empty strings here; confirm
    # how boto resolves them in the deployment environment.
    conn = boto.dynamodb.connect_to_region('us-west-2',
                                           aws_access_key_id='',
                                           aws_secret_access_key='')
    table = conn.get_table(table)
    for line in table.scan():
        text = TextBlob(line[field]).lower()
        textwords = text.split()
        wordcount = len(textwords)
        if wordcount == 0:
            # Avoid dividing by zero for empty text.
            lexdiv = 0
        else:
            # A set gives the unique-word count in one pass instead of the
            # quadratic "append if not already in list" loop.
            lexdiv = round(len(set(textwords)) / float(wordcount), 2)

        sentiment = text.sentiment
        line.put_attribute('subjectivity', sentiment.subjectivity)
        line.put_attribute('polarity', sentiment.polarity)
        line.put_attribute('lexical diversity', lexdiv)
        line.save()
def get_text(self):
    """Generate a sentence from ``self.content`` and store it in ``self.text``.

    ``self.content`` is decoded as UTF-8, split into whitespace-separated
    tokens, fed to ``parser.build_ngram_dict`` and the resulting dictionary
    is passed to ``parser.build_sentence``.

    NOTE: THIS SHOULD NOT REBUILD DICT EVERY TIME -- REFACTOR
    """
    decoded = self.content.decode('utf-8')
    tokens = TextBlob(decoded).split()
    ngram_dict = parser.build_ngram_dict(tokens)
    # TODO: add check for max text length
    self.text = parser.build_sentence(ngram_dict)
def clean(doc, stop_words, exclude):
    """Return ``doc`` lower-cased, stop-word free, de-punctuated and lemmatized.

    Args:
        doc: Input text.
        stop_words: Container of words to drop (compared after lower-casing).
        exclude: Container of characters to delete.

    Returns:
        The remaining words, each passed through the module-level
        ``lemma.lemmatize``, joined by single spaces.
    """
    kept_words = [w for w in doc.lower().split() if w not in stop_words]
    stop_free = " ".join(kept_words)
    punc_free = ''.join(filter(lambda ch: ch not in exclude, stop_free))
    tokens = TextBlob(punc_free).split()
    lemmas = [lemma.lemmatize(token) for token in tokens]
    return " ".join(lemmas)
def _count_word_hits(words, haystack):
    """Count how many of ``words`` (used as regex patterns) occur in ``haystack``."""
    hits = 0
    for word in words:
        try:
            found = re.search(word, haystack)
        except re.error:
            # The word is not a valid pattern (e.g. a lone "(" or "c++");
            # match it literally instead of aborting the whole search.
            found = re.search(re.escape(word), haystack)
        if found:
            hits += 1
    return hits


def get_best_lines(sentence):
    """Find the instances in ``newtrim.csv`` that best match ``sentence``.

    The sentence is split on spaces into unique words.  Every line of
    ``newtrim.csv`` (located next to this module) that passes the optional
    price / cpu filters extracted from the sentence is scored by how many
    of those words occur in it, case-insensitively.  The lines sharing the
    highest score are kept.

    Args:
        sentence: The user's query as a single string.

    Returns:
        A list of the distinct instance names (via
        ``get_instance_from_sentence``) of the best-scoring lines.
    """
    parsed = TextBlob(sentence)
    # Unique, lower-cased query words; lower-casing once here replaces the
    # per-line, per-word lower-casing of the original.
    userinput = [word.lower() for word in set(parsed.split(" "))]

    priceflag, priceop, pricenum = get_filter_variables("price", sentence)
    cpuflag, cpuop, cpunum = get_filter_variables("cpu", sentence)

    bestlines = []
    blcount = 0
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # ``with`` closes the file on every path; the previous try/finally
    # raised NameError on ``fp`` when ``open`` itself failed.
    with open(dir_path + '/newtrim.csv', 'r') as fp:
        for line in fp:
            if priceflag and not compare_string_op(
                    float(get_price_from_sentence(line)), float(pricenum),
                    priceop):
                continue
            if cpuflag and not compare_string_op(
                    float(get_cpu_from_sentence(line)), float(cpunum),
                    cpuop):
                continue

            # Remove duplicate fields so a repeated value is scored once.
            filteredline = " ".join(set(line.split(","))).lower()
            count = _count_word_hits(userinput, filteredline)
            if count > blcount:
                blcount = count
                bestlines = [line]
            elif count == blcount:
                bestlines.append(line)

    ec2instances = [get_instance_from_sentence(line) for line in bestlines]
    return list(set(ec2instances))
def dk():
    """Translate the text in ``varname1`` and show the result.

    The source language is detected with TextBlob, the target language code
    is looked up in ``lan_dict`` using the current ``languages`` selection,
    and the translation is written to ``label3`` and ``varname2``.
    """
    source = TextBlob(varname1.get())
    lan = source.detect_language()
    lan_to = lan_dict[languages.get()]
    translated = source.translate(from_lang=lan, to=lan_to)
    label3.configure(text=translated)
    varname2.set(translated)
def __init__(self, article):
    """Compute stylometric features of ``article`` and store them on ``self``.

    Features are stored by key (``self[name] = value``): sentiment polarity
    and subjectivity, punctuation frequencies per sentence, punctuation and
    function-word frequencies per thousand words, article length,
    type/token ratio, mean word length and mean / standard deviation of
    sentence length (in words).

    Args:
        article: The article text.
    """
    article = TextBlob(article)
    words = article.split()
    sentences = article.split('.')

    sentiment = article.sentiment
    self['polarity'] = sentiment.polarity
    self['subjectivity'] = sentiment.subjectivity

    word_lens = [len(word) for word in words]
    sentence_lens = [len(sentence.split()) for sentence in sentences]
    punct = [char for char in article if char in punctuation]

    # name -> (items to search, item to count, normalising population)
    freq_items = {
        'freq_question_marks': (punct, '?', sentences),
        'freq_exclamation_marks': (punct, '!', sentences),
        # Was counting '?' a second time (copy-paste slip).
        'freq_quotation_marks': (punct, '"', sentences),
    }
    freq_items_per_thousand = {
        'freq_commas': (punct, ',', words),
        'freq_semi_colons': (punct, ';', words),
        'freq_ands': (words, 'and', words),
        'freq_buts': (words, 'but', words),
        'freq_howevers': (words, 'however', words),
        'freq_ifs': (words, 'if', words),
        'freq_thats': (words, 'that', words),
        'freq_mores': (words, 'more', words),
        'freq_verys': (words, 'very', words),
    }
    for item, (population, target, norm) in freq_items.items():
        self[item] = self.find_freq(population, target, norm)
    for item, (population, target, norm) in freq_items_per_thousand.items():
        self[item] = self.find_freq_per_thousand(population, target, norm)

    self['article_len'] = len(words)
    # An article without words has no meaningful ratio; report 0.0 rather
    # than raising ZeroDivisionError.
    self['type_token_ratio'] = (len(set(words)) / self['article_len']
                                if words else 0.0)
    self['mean_word_len'] = np.mean(word_lens)
    self['mean_sentence_len'] = np.mean(sentence_lens)
    self['std_sentence_len'] = np.std(sentence_lens)
def tweet_processor(path, part, freq=1):
    """Clean the tweets of a CSV file and compute sentiment and word counts.

    Args:
        path: CSV file with a ``text`` column.
        part: 1-based index of the half of the file to process; forced to
            ``1`` when ``"May"`` occurs in ``path``.
        freq: ``0`` counts words of the raw ``text`` column of the whole
            file; ``1`` (default) delegates to ``pfreq_dist`` on the
            processed tweets.  Any other value yields an empty list.

    Returns:
        ``(processed_tweets, freqs, compound_sent)`` where
        ``processed_tweets`` is the list of cleaned tweets longer than two
        characters, ``freqs`` is the frequency table described above and
        ``compound_sent`` is an array with one ``[neg, pos, neu, compound]``
        VADER row per kept tweet.
    """
    from collections import Counter

    myFile = pd.read_csv(path, sep=',')
    tweets = myFile["text"]
    if "May" in path:
        part = 1
    tweets = tweets[int(len(tweets) * (part - 1) * 0.5):
                    int(len(tweets) * part * 0.5)]

    processed_tweets = []
    compound_sent = []
    print("n tweets: ", len(tweets))
    sid = SentimentIntensityAnalyzer()
    for tweet in tweets:
        cleaned_tweet = p.clean(tweet.lower())
        filtered_tweet = clean_tweets(cleaned_tweet)
        ss = sid.polarity_scores(filtered_tweet)
        # len() > 2 already excludes the empty string.
        if len(filtered_tweet) > 2:
            processed_tweets.append(filtered_tweet)
            compound_sent.append(
                [ss['neg'], ss['pos'], ss['neu'], ss['compound']])
    # np.savetxt("processed_tweets.csv", processed_tweets, delimiter=",", fmt='%s')
    compound_sent = np.asarray(compound_sent)

    processed_words = " ".join(processed_tweets).split(" ")
    print("number of words: ", len(processed_words))
    print("unique words: ", len(set(processed_words)))

    freqs = []
    if freq == 0:
        # Count over the raw text of the whole file.  The original kept
        # this word list in ``blob`` but overwrote ``blob`` with a per-tweet
        # TextBlob inside the loop, so it ended up counting substrings of
        # the last tweet; it also sorted the counts as strings.
        corpus_words = " ".join(myFile["text"]).split(" ")
        counts = Counter(word for word in corpus_words if len(word) > 2)
        # most_common() orders by integer count, highest first.
        freqs = np.asarray([[word, n] for word, n in counts.most_common()])
    if freq == 1:
        # Use NLTK freqdist
        freqs = np.asarray(pfreq_dist(processed_words))
    return processed_tweets, freqs, compound_sent
def makeGraph2(url):
    """Render a sentiment heatmap for the article at ``url``.

    The article is downloaded and parsed with ``Article``, its text is
    split into words and a two-row heatmap (polarity and subjectivity) is
    drawn with one column for the whole text (labelled ``"SENTENCE"``)
    followed by one column per word.

    Args:
        url: Address of the article.

    Returns:
        A ``BytesIO`` positioned at 0 containing the PNG image.
    """
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    blob = TextBlob(article.text)

    def visualise_sentiments(data):
        """Draw ``data`` as a heatmap and return it as an in-memory PNG."""
        svm = sns.heatmap(pd.DataFrame(data).set_index("Sentence").T,
                          center=0,
                          annot=True,
                          cmap="PiYG")
        image_object = BytesIO()
        figure = svm.get_figure()
        figure.savefig(image_object, format="PNG", facecolor="#36393E")
        image_object.seek(0)
        return image_object

    words = list(blob.split())
    overall = blob.sentiment
    # Score each word on its own.  The original repeated the whole-article
    # score for every word and put complete Sentiment tuples (not numbers)
    # in the "Subjectivity" row, which a heatmap cannot plot.
    word_scores = [TextBlob(word).sentiment for word in words]
    return visualise_sentiments({
        "Sentence": ["SENTENCE"] + words,
        "Sentiment": [overall.polarity] + [s.polarity for s in word_scores],
        "Subjectivity": [overall.subjectivity] +
                        [s.subjectivity for s in word_scores],
    })
def click(event=None):
    """Translate the text in ``varname`` and display it.

    The source language is detected with TextBlob and the target code is
    looked up in ``lang_dict`` from the ``languages`` selection.  The
    translation is shown in ``label3`` and stored in ``varname1``; if any
    step fails, ``varname1`` is set to a retry hint instead.

    Args:
        event: Optional GUI event; unused, present so the function can be
            bound as an event handler.
    """
    try:
        source = TextBlob(varname.get())
        ln = source.detect_language()
        ln_to = lang_dict[languages.get()]
        translated = source.translate(from_lang=ln, to=ln_to)
        label3.configure(text=translated)
        varname1.set(translated)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        varname1.set("try another keyword")
def clean_process(text):
    """Normalise a review text and return its lemmatized tokens.

    Steps: lower-case, delete punctuation and digits, trim, spell-correct
    with TextBlob, drop English and project-specific stop words, then
    lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The raw text.

    Returns:
        A list of lemmatized word tokens.
    """
    # make lowercase
    clean_text = text.lower()
    # remove punctuation and numbers in a single pass
    clean_text = clean_text.translate(
        str.maketrans('', '', string.punctuation + string.digits))
    # remove surplus whitespace at the start/end of the review
    clean_text = clean_text.strip()
    # spelling correction
    clean_text = TextBlob(clean_text).correct()

    # remove stopwords; the NLTK list is loaded once instead of once per
    # word as before
    english_stopwords = set(stopwords.words('english'))
    kept = [
        word for word in clean_text.split(' ')
        if word not in english_stopwords and word not in new_stopwords
    ]

    # make it whole again, then tokenize the string into word tokens
    tokens = ' '.join(kept).split()

    # lemmatize every token
    return [lemmatizer.lemmatize(word) for word in tokens]
def translate_func():
    """Translate the text in ``l1_txt`` into the selected language.

    The source language is detected with TextBlob and the target code is
    looked up in ``ln_dict`` from the ``languages`` selection.  The result
    is stored in ``var2``; if any step fails, ``var2`` receives a retry
    hint instead.
    """
    try:
        source = TextBlob(l1_txt.get())
        lan = source.detect_language()
        lan_to = ln_dict[languages.get()]
        var2.set(source.translate(from_lang=lan, to=lan_to))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        var2.set("Try any other word or sentence.")
def main():
    """Build an n-gram model from a corpus file and print a generated sentence.

    The file named by the first command-line argument is read as UTF-8,
    split into tokens and passed to ``build_ngram_dict``; the dictionary is
    pretty-printed, a sentence is generated with ``build_sentence`` and a
    warning is printed when that sentence occurs verbatim in the corpus.
    """
    import io

    filename = sys.argv[1]
    # Decode once while reading.  Calling ``.decode`` on the result of a
    # text-mode ``open`` fails on Python 3, and the corpus was decoded
    # twice.
    with io.open(filename, encoding='utf-8') as f:
        content = f.read()

    words = TextBlob(content).split()
    d = build_ngram_dict(words)
    pprint(d)
    print()
    s = build_sentence(d)
    print(s)
    if s in content:
        print("\nBummer! This sentence is just a copy of one in the corpus.")
def respond(sentence):
    """Build, print and return the bot's reply to ``sentence``.

    Candidate replies are tried in this order, the first non-empty one
    winning: a comment about the bot, a name question / self-introduction,
    a greeting, an app or problem enquiry, and finally a sentence
    constructed from the detected pronoun, noun and verb.  A random
    fallback is used if all of them come up empty.
    """
    parsed = TextBlob(preprocess_text(sentence))
    pronoun, noun, adjective, verb = find_candidate_parts_of_speech(parsed)

    # A remark aimed at the bot with a direct noun takes precedence over
    # every other candidate.
    resp = check_for_comment_about_bot(pronoun, noun, adjective)

    # Asking for the bot's name, or introducing oneself ("I am ...").
    if not resp:
        for token in parsed.words:
            if token in ("Name", "name"):
                resp = random.choice(RESPONSES_TO_NAME)
            elif token == "am":
                last_word = parsed.split()[-1]
                resp = random.choice(GREET_WITH_NAME).format(word=last_word)

    # Return a greeting when we were greeted.
    if not resp:
        resp = check_for_greeting(parsed)

    # Questions or problems regarding the app or service.
    if not resp:
        for token in parsed.words:
            if token in ("app", "APP"):
                resp = random.choice(APP_SERVICES_INFO)
            # NOTE(review): ``and`` binds tighter than ``or``, so only
            # "problem" requires a pronoun; "issues"/"issue"/"help" trigger
            # on their own.  Confirm this is intended.
            about_problem = pronoun and token == "problem"
            if about_problem or token in ("issues", "issue", "help"):
                resp = random.choice(APP_PROBLEM)

    # Nothing matched so far: try to construct a new sentence.
    if not resp:
        if not pronoun:
            resp = random.choice(NONE_RESPONSES)
        elif pronoun == 'I' and not verb:
            resp = random.choice(COMMENTS_ABOUT_SELF)
        else:
            resp = construct_response(pronoun, noun, verb)

    # Still nothing: fall back to a random response.
    if not resp:
        resp = random.choice(NONE_RESPONSES)

    print(resp)
    return resp
def get_sentiment(clean_words):
    """Print each word with its TextBlob sentiment and a polarity label.

    Args:
        clean_words: Iterable of word strings; they are joined with spaces
            and re-split before analysis.
    """
    # TextBlob works on strings, so the list is joined first.
    blob = TextBlob(' '.join(clean_words))
    for token in blob.split():
        print(token)
        # polarity and subjectivity scores of the single word
        analysis = TextBlob(token)
        print(analysis.sentiment)
        # first sentiment field is the polarity, ranging from -1 to 1
        polarity = analysis.sentiment[0]
        if polarity > 0:
            label = 'Positive'
        elif polarity < 0:
            label = 'Negative'
        else:
            label = 'Neutral'
        print(label)
def callback(ch, method, properties, body):
    """Store the sentiment polarity of one queued message.

    The message body is decoded as UTF-8.  A document with the current UTC
    time (``date``) and the TextBlob polarity (``polarity``) is inserted
    into ``reddit_col`` when the first whitespace-separated token of the
    message is ``REDDIT`` and into ``tweet_col`` otherwise.

    Args:
        ch: Unused.
        method: Unused.
        properties: Unused.
        body: Raw message bytes.
    """
    # We pass each tweet/reddit comment to TextBlob for decoding
    message = TextBlob(body.decode("utf-8"))

    # Figure out whether it's a tweet or a reddit comment.  An empty or
    # whitespace-only message has no first token; it is treated as a tweet
    # instead of raising IndexError.
    tokens = message.split()
    is_reddit = bool(tokens) and tokens[0] == 'REDDIT'

    # Get the timestamp and the polarity
    result = {
        "date": datetime.datetime.utcnow(),
        "polarity": message.sentiment.polarity,
    }

    if is_reddit:
        # Insert into reddit collection
        reddit_col.insert_one(result)
    else:
        # Insert into tweet collection
        tweet_col.insert_one(result)
class TransformText:
    """Text-processing helpers built on TextBlob, NLTK, spaCy and gensim.

    An instance wraps one document (``self.t_text``, a TextBlob) and offers
    tokenizers, POS taggers, regex chunkers, bag-of-words / topic-model
    helpers and an HTML stripper.  Most methods also store their result on
    the instance under the attribute named in their description.
    """

    def __init__(self, text):
        """Wrap ``text`` (coerced with ``str()``) and load the spaCy model."""
        self.t_textext = str(text)
        self.t_textmp_phrases = defaultdict(int)
        # Tokens/characters dropped by the cleaning methods.
        # IMPORTANT: adding or removing values will change their output.
        self.stop_list = [
            "(", ")", "]", ".", "\\", "/", "[", '...', '–', ':', ';', '____',
            '___', '+', '/w', '>'
        ]
        self.sub_chunks = []
        # Initialised here so full_tokener() can be called before
        # np_sub_chunks(), which used to be the only place creating it.
        self.clean_list = []
        # NOTE(review): nothing in this class fills ``chunks``; it is
        # initialised so return_chunks() yields an empty list instead of
        # raising AttributeError. Confirm where it should be populated.
        self.chunks = []

        # Chunk grammars; the first expression is the noun-adj/verb-noun
        # phrase. This grammar parses sentences in some very interesting
        # ways - definitely worth keeping this list of expressions.
        # NPS and VPH patterns
        self.grammar_II = """
                NNP: {<J.*|N.*>+}
                NMM: {<SYM>?<CD>?<N.+>+}
                VAN: {<V.*>?<J.*|N.*>*<HYPH>*<J.*|N.*>+}
                NUM: {<CD>+}
                """
        self.grammar_III = """
                NUM: {<CD>+}
                VPH: {<V.*|N.*>*<IN>*<V.*|N.*>+}
                """
        self.t_text = TextBlob(self.t_textext)
        self.nlp = spacy.load('en', parser=False)  # spaCy corpus https://spacy.io

    # ------------------------------------------------------------------
    # TextBlob-based helpers: sentence chunking, basic word tokenization,
    # n-grams, noun-phrase chunking and a simple POS tagger.
    # Note: only use these for basic processing - sentence
    # chunking / bi-grams.
    # ------------------------------------------------------------------

    def get_sentences(self):
        """Return the sentences of the wrapped text."""
        return self.t_text.sentences

    def get_words(self):
        """Return the words of the wrapped text."""
        return self.t_text.words

    def get_bigrams(self):
        """Return the 2-grams of the wrapped text."""
        return self.t_text.ngrams(n=2)

    def get_trigrams(self):
        """Return the 3-grams of the wrapped text."""
        return self.t_text.ngrams(n=3)

    def get_np_chunks(self, text=None):
        """Return noun phrases; a given ``text`` replaces the wrapped text."""
        if text is not None:
            self.t_text = TextBlob(text)
        return self.t_text.noun_phrases

    def simple_tagger(self, text=None):
        """Return POS tags; a given ``text`` replaces the wrapped text."""
        if text is not None:
            self.t_text = TextBlob(text)
        return self.t_text.tags

    # ------------------------------------------------------------------
    # Methods that break out of TextBlob and use NLTK (directly), gensim
    # or home-rolled solutions. The tokenizers currently only break down
    # words; their output can be altered through ``stop_list``.
    # ------------------------------------------------------------------

    def _strip_stop_items(self, items):
        """Join the elements of ``items`` that are not in ``stop_list``."""
        return ''.join(item for item in items if item not in self.stop_list)

    def full_tokener(self):
        """Append the cleaned words of the text to ``clean_list`` and return it.

        Each whitespace-separated word has every character listed in
        ``stop_list`` removed.  A word made up only of such characters
        contributes an empty string (it used to repeat the previous word or
        raise NameError).
        """
        for token in self.t_text.split():
            for word in token.split():
                self.clean_list.append(self._strip_stop_items(word))
        return self.clean_list

    def sent_tokener(self):
        """Store and return one whitespace-token list per sentence (``sent_tokens``)."""
        self.sent_tokens = [
            str(sentence).split() for sentence in self.t_text.sentences
        ]
        return self.sent_tokens

    def clean_sent_tokens(self, sent_list):
        """Store and return the entries of ``sent_list`` without stop items.

        For each entry, its elements (characters of a string, items of a
        list) that are not in ``stop_list`` are concatenated without a
        separator.  The result is also kept in ``clean_sent``.
        """
        self.clean_sent = [self._strip_stop_items(entry) for entry in sent_list]
        return self.clean_sent

    # ---------------------- Regex POS tagging -------------------------

    def nltk_tagger(self, pros_list):
        """Tag a tokenized document with ``nltk.pos_tag``.

        Takes a tokenized document or list of words with 'most' special
        characters removed.  The tagged list is stored in ``tagged`` and
        returned.
        """
        self.tagged = nltk.pos_tag(pros_list)
        return self.tagged

    # ------------------------------------------------------------------
    # spaCy chunker / POS tools. space_tagger and space_ent must be given
    # raw text strings, not lists as in the case of the other methods.
    # ------------------------------------------------------------------

    def space_tagger(self, text_doc):
        """Store and return ``(text, tag_)`` pairs for every spaCy token (``docs``)."""
        self.docs = [(word.text, word.tag_) for word in self.nlp(text_doc)]
        return self.docs

    def space_ent(self, text_doc):
        """Store and return ``(text, label_)`` pairs for every spaCy entity (``doc_ents``)."""
        self.doc_ents = [(ent.text, ent.label_)
                         for ent in self.nlp(text_doc).ents]
        return self.doc_ents

    def parse(self, tagged_text):
        """Chunk ``tagged_text`` with ``grammar_II``; store and return ``result``."""
        cp = nltk.RegexpParser(self.grammar_II)
        self.result = cp.parse(tagged_text)
        return self.result

    # ------------------------------------------------------------------
    # Sentence based chunker methods - as of 3/22/2017 these produce the
    # best results.
    # ------------------------------------------------------------------

    def np_sub_chunks(self, result):
        """Collect ``[label, word, ...]`` lists for every subtree of ``result``.

        ``result`` is a chunk tree such as the one returned by ``parse``.
        The lists are appended to ``sub_chunks`` (which therefore grows
        across calls) and ``sub_chunks`` is returned.  ``clean_list`` is
        reset as a side effect.
        """
        self.clean_list = []
        # uses the nltk.Tree class and tree class methods | worth reading
        for tree in result.subtrees():
            # label first (labels are defined by the grammar attribute),
            # then the words of the subtree's leaves
            phrases = [tree.label()]
            phrases.extend(leaf[0][0] for leaf in tree.pos())
            self.sub_chunks.append(phrases)
        return self.sub_chunks

    def return_sub_chunks(self, lower=True):
        """Return the unique ``NMM`` phrases collected in ``sub_chunks``.

        Each matching chunk is collapsed in place to ``[label, phrase]``;
        phrases are lower-cased unless ``lower`` equals ``False``.  The
        unique phrases are printed, stored in ``doc_phrases`` and returned.
        """
        tag_set = ["NMM"]
        self.sent_phrases = []
        for phrase in self.sub_chunks:
            if phrase[0] not in tag_set:
                continue
            joined = ' '.join(phrase[1:])
            # ``== False`` kept deliberately: only values equal to False
            # (False, 0) disable lower-casing.
            if not (lower == False):  # noqa: E712
                joined = joined.lower()
            phrase[1:] = [joined]
            self.sent_phrases.append(joined)
        print(list(set(self.sent_phrases)))
        self.doc_phrases = list(set(self.sent_phrases))
        return self.doc_phrases

    def return_chunks(self):
        """Return the unique space-joined phrases of ``chunks`` (``doc_phrases``)."""
        phrase_set = [' '.join(chunk) for chunk in self.chunks]
        self.doc_phrases = list(set(phrase_set))
        return self.doc_phrases

    def phrase_dump(self, doc_term, phrase_list, sentences):
        """Update and return the dump dict intended for ``json.dumps``.

        Sets the keys ``document``, ``phrases`` and ``sentences``.
        """
        self.t_textmp_phrases.update(document=doc_term,
                                     phrases=phrase_list,
                                     sentences=sentences)
        return self.t_textmp_phrases

    # ------------------------------------------------------------------
    # Experimental: LDA and non-parametric Bayesian inference models for
    # topic analysis. Combined with noun phrasing / named entity
    # extraction, topic models can provide a baseline for hierarchical
    # classification of text/conceptual relationships.
    # ------------------------------------------------------------------

    def gen_bag_words(self, documents):
        """Build a gensim dictionary and bag-of-words corpus from ``documents``.

        The dictionary is saved to ``data_dump.dic`` and kept in
        ``dictionary``; the corpus is kept in ``corpus`` and returned.
        """
        self.dictionary = corpora.Dictionary(documents)
        # Note: uses a different approach than SDRCake
        self.dictionary.save('data_dump.dic')
        self.corpus = [
            self.dictionary.doc2bow(document) for document in documents
        ]
        return self.corpus

    def bayesian_topic(self):
        """Fit gensim's ``HdpModel`` on ``corpus``; store and return ``model``."""
        self.model = models.HdpModel(self.corpus, id2word=self.dictionary)
        return self.model

    # IMPORTANT method - turns a document into an edge list
    # (document terms as network).
    def pairwise(self, iterable):
        "s -> (s0,s1), (s1,s2), (s2, s3), ..."
        a, b = tee(iterable)
        next(b, None)
        self.pairs = zip(a, b)
        return self.pairs

    # ------------------------------------------------------------------
    # Baseline HTML tag removal. Should be called on all documents to
    # ensure special characters are removed. It is an older method that
    # originally shipped with NLTK and was removed in later releases.
    # ------------------------------------------------------------------

    def clean_html(self, html):
        """
        Remove HTML markup from the given string.

        Inline script/style blocks, comments and tags are removed, then
        every whitespace-separated token of at most one character is
        dropped and the rest is joined with single spaces.

        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """
        self.str_html = str(html)
        # First we remove inline JavaScript/CSS:
        self.cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "",
                              self.str_html.strip())
        # Then we remove html comments. This has to be done before removing
        # regular tags since comments can contain '>' characters.
        self.cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", self.cleaned)
        # Next we can remove the remaining tags:
        self.cleaned = re.sub(r"(?s)<.*?>", " ", self.cleaned)
        # Finally, we deal with whitespace
        # NOTE(review): the first and third substitution replace a space
        # with a space; the patterns may have been altered at some point -
        # confirm against the intended upstream version.
        self.cleaned = re.sub(r" ", " ", self.cleaned)
        self.cleaned = re.sub(r"[\s]", " ", self.cleaned)
        self.cleaned = re.sub(r" ", " ", self.cleaned)
        self.cleaned = re.sub(r" ", "\n", self.cleaned)
        # Build a new list: removing items from the list being iterated
        # skipped the element after every removal.
        self.clean = [token for token in self.cleaned.split()
                      if len(token) > 1]
        self.clean = ' '.join(self.clean)
        return self.clean
# [2-3] ON YOUR OWN: # Using the code above for figures, create a new table that lists the top 10 most frequent words and how many times they occur in that text. import plotly.graph_objects as go fig = go.Figure(data=[ go.Table(header=dict(values=['A Scores', 'B Scores']), cells=dict(values=[[100, 90, 80, 90], [95, 85, 75, 95]])) ]) fig.show() #%% from collections import Counter split_words = blob.split() Counter = Counter(split_words) most_occur = Counter.most_common(10) ##print(most_occur) #split into two variables mostusedword, countofuse = zip(*most_occur) print(mostusedword) print(countofuse) #I tried to install "Plotly" to do this question, but I do not have the administrative #priviliges to download plotly. Thus, I can't make a table for this question, but I
if __name__ == "__main__":
    # Usage: <script> NUM_FILES FILE_1 ... FILE_N
    # Every input file holds one JSON-encoded tweet per line.
    count = 0
    days = {}
    polarity = {}

    # Weekday abbreviation (first token of ``created_at``) -> calendar date
    # of the fixed week 2016-04-03 .. 2016-04-09.
    DAY_TO_DATE = {
        'Sun': "2016-04-03",
        'Mon': "2016-04-04",
        'Tue': "2016-04-05",
        'Wed': "2016-04-06",
        'Thu': "2016-04-07",
        'Fri': "2016-04-08",
        'Sat': "2016-04-09",
    }

    num_files = int(sys.argv[1])
    for i in range(1, num_files + 1):
        # ``with`` closes each input file; they were previously left open.
        with open(sys.argv[1 + i], 'r') as input_file:
            for line in input_file:
                tweet_json = json.loads(line)
                tweet = TextBlob(tweet_json['text'])
                blob = TextBlob(tweet_json['created_at'])
                date = blob.split(' ')
                day = date[0]
                # Unknown tokens are left unchanged, as with the former
                # chain of ``if`` statements.
                day = DAY_TO_DATE.get(day, day)
class Article_Reading:
    """Download an article and expose its metadata and TextBlob analysis.

    The constructor downloads, parses and NLP-processes the article at the
    given URL with ``Article`` and keeps the text as a TextBlob in
    ``ParseText``.  The ``get_article_*`` methods print a field and return
    it.
    """

    link = None
    url = None
    article = None
    analysis = None
    # ``ParseText`` is the attribute the methods actually use; the
    # misspelled ``PharseText`` is kept only so existing references to it
    # do not break.
    ParseText = None
    PharseText = None
    file_name = 'data/saved_article.csv'

    def __init__(self, url):
        """Fetch and process the article at ``url``."""
        self.url = url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.article.nlp()
        self.ParseText = TextBlob(self.article.text)

    def analize_total_text(self):
        """Store a TextBlob of the full text in ``analysis`` and print its polarity."""
        self.analysis = TextBlob(self.article.text)
        print(self.analysis.polarity)

    def analize_by_sentence(self):
        """Print every sentence of the article."""
        for sentence in self.ParseText.sentences:
            print(sentence)

    def get_article_title(self):
        """Print and return the article title."""
        print(self.article.title)
        return self.article.title

    def get_article_author(self):
        """Print and return the article authors."""
        print(self.article.authors)
        return self.article.authors

    def get_article_date(self):
        """Print and return the publication date."""
        print(self.article.publish_date)
        return self.article.publish_date

    def get_article_summary(self):
        """Print and return the article summary."""
        print(self.article.summary)
        return self.article.summary

    def get_article_tags(self):
        """Print and return the article tags."""
        print(self.article.tags)
        return self.article.tags

    def save_article(self):
        """Print the article text split on single spaces.

        Nothing is written to ``file_name`` by this method.
        """
        # TODO: strip punctuation from the tokens and append title, author,
        # date, summary and tags as one CSV row to ``self.file_name``.
        test = self.ParseText.split(' ')
        print(test)

    def test(self):
        """Print ``hello`` (smoke test)."""
        print("hello")
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from pathlib import Path
from wordcloud import WordCloud
import imageio

nltk.download("stopwords")
stops = stopwords.words("english")
# Set copy for constant-time membership tests; ``stops`` itself stays a list.
stop_set = set(stops)

old_john = TextBlob(Path("book of John text.txt").read_text())
john = old_john.split()

# New list of words containing no stop words
new_john = [jan for jan in john if jan not in stop_set]


# The book of John is tokenized and POS-tagged. Every token whose tag
# starts with "NN" (a noun) goes into ``noun_john``; a frequency
# distribution is then built from the lower-cased nouns.
# .most_common(x) returns the "x" most common words with their counts.
def is_noun(pos):
    """Return True when the POS tag ``pos`` starts with ``NN``."""
    return pos[:2] == 'NN'


tokenized_john = nltk.word_tokenize(str(old_john))
noun_john = [
    word for (word, pos) in nltk.pos_tag(tokenized_john) if is_noun(pos)
]

# NOTE(review): rebinding ``stopwords`` hides the imported corpus reader
# from here on; kept because later code may rely on the list.
stopwords = nltk.corpus.stopwords.words('english')

# Compare the lower-cased word: the stop-word list is lower case, so
# capitalised stop words (e.g. "The") previously slipped through.
john_frequency = nltk.FreqDist(w.lower() for w in noun_john
                               if w.lower() not in stop_set)
top15_johns = john_frequency.most_common(15)
john_wc = " ".join([str(jee) for jee in top15_johns])
def Getsearch(self):
    """Search Twitter, clean the tweets, show them and save them.

    The search term and the number of tweets are read from ``self.search1``
    and ``self.search2``.  Each English tweet found is cleaned (links,
    entities, mentions, hashtags and tickers removed; lower-cased; negation
    contractions expanded; letters only; spell-corrected; words of one or
    two letters dropped; whitespace normalised), inserted at the top of
    ``self.TxtBox`` and written as one line to ``tweets.csv``.
    """
    auth = OAuthHandler(self.API_KEY, self.API_SECRET_KEY)
    auth.set_access_token(self.ACESS_TOKEN_KEY, self.ACESS_TOKEN_SECRET_KEY)
    api = tweepy.API(auth)
    searchTerm = str(self.search1.get())
    NoOfTerms = int(self.search2.get())
    tweets = tweepy.Cursor(api.search, q=searchTerm,
                           lang="en").items(NoOfTerms)
    raw_texts = [tweet.text for tweet in tweets]

    # Built once instead of once per tweet.
    negations_dic = {
        "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not",
        "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "can't": "can not", "couldn't": "could not",
        "shouldn't": "should not", "mightn't": "might not",
        "mustn't": "must not",
    }
    negation_re = re.compile(r'\b(' + '|'.join(negations_dic) + r')\b')

    # ``with`` guarantees the file is closed even if cleaning a tweet fails.
    with open("tweets.csv", 'w') as fp1:
        for raw in raw_texts:
            text = str(raw)
            # Links first, matched token-wise: the previous greedy pattern
            # (``https?://.*/\w*``) deleted everything between the first
            # and the last link and missed links without a path slash.
            text = re.sub(r'https?://\S*', '', text)   # remove hyperlinks
            text = re.sub(r'www\.\S+', '', text)       # remove www. urls
            text = re.sub(r'&\w*', '', text)           # remove & entities
            text = re.sub(r'@\S+', '', text)           # remove @mentions
            text = re.sub(r'#\w*', '', text)           # remove hashtags
            text = re.sub(r'\$\w*', '', text)          # remove tickers
            text = text.strip(' ').lower()
            # Expand contractions while the apostrophes are still there.
            text = negation_re.sub(lambda m: negations_dic[m.group()], text)
            text = re.sub(r'[^a-zA-Z]', ' ', text)     # keep letters only
            text = str(TextBlob(text).correct())
            # Remove words with 2 or fewer letters
            text = re.sub(r'\b\w{1,2}\b', '', text)
            text = ' '.join(text.split())              # normalise spaces

            self.TxtBox.insert(0.0, "\n" + text + "\n")
            fp1.write(text + '\n')
def clean_text(self, text):
    """Normalise a comment for toxic-text classification and return it.

    The text is lower-cased; URLs and IPv4 addresses are removed;
    optionally wiki markup is blanked (``self.clean_wiki_tokens``); the
    replacements in ``self.clean_word_dict`` are applied; contractions are
    expanded and punctuation is spaced out or dropped; digits are removed.
    Depending on the instance flags the result is then typo-normalised
    (``convert_typo``), stemmed (``stem_words``), lemmatized
    (``lemmatize``), spell-corrected (``error_correct``) and counted into
    ``self.word_count_dict`` (``count_null_words``).

    Args:
        text: The raw comment.

    Returns:
        The cleaned text as a ``str``.

    Raises:
        NotImplementedError: If ``self.remove_stopwords`` is set.
    """
    text = text.lower()
    text = re.sub(
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
        "", text)
    text = re.sub(
        r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}",
        "", text)

    if self.clean_wiki_tokens:
        wiki_patterns = (
            # pictures
            r"image:[a-zA-Z0-9]*\.jpg",
            r"image:[a-zA-Z0-9]*\.png",
            r"image:[a-zA-Z0-9]*\.gif",
            r"image:[a-zA-Z0-9]*\.bmp",
            # css
            r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})",
            r"\{\|[^\}]*\|\}",
            # templates
            r"\[?\[user:.*\]",
            r"\[?\[user:.*\|",
            r"\[?\[wikipedia:.*\]",
            r"\[?\[wikipedia:.*\|",
            r"\[?\[special:.*\]",
            r"\[?\[special:.*\|",
            r"\[?\[category:.*\]",
            r"\[?\[category:.*\|",
        )
        for pattern in wiki_patterns:
            text = re.sub(pattern, " ", text)

    # clean char type
    for typo, correct in self.clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    # Abbreviation / punctuation rules. The order matters: e.g. "can't"
    # must be handled before the generic "n't".
    rules = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\?", " ? "),
        (r"\!", " ! "),
        (r"\"", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    # Numeric chars
    text = re.sub(r'\d+', " ", text)

    # NOTE(review): the original comment says this gets rid of
    # punctuation, but the pattern only drops one leading word character
    # followed by whitespace - confirm the intent.
    text = re.sub(r"^\w\s", "", text)

    # In toxic, words like fuckkkkk or fffffuck are explicit
    if self.convert_typo:
        convert_text = text.split()
        for marker in ("f**k", "dick", "bitch"):
            convert_text = [
                w if marker not in w else marker for w in convert_text
            ]
        text = " ".join(convert_text)

    if self.stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    if self.lemmatize:
        wnl = WordNetLemmatizer()
        text = " ".join(wnl.lemmatize(word) for word in text.split())

    if self.remove_stopwords:
        raise NotImplementedError

    if self.error_correct:
        # ``correct()`` returns a TextBlob; convert so the method returns
        # ``str`` on every path.
        text = str(TextBlob(text).correct())

    if self.count_null_words:
        tokens = text.split()
        for t in tokens:
            self.word_count_dict[t] += 1
        text = " ".join(tokens)

    # Get rid of unnecessary blanks (when char_level == True, too many
    # blanks may hurt the result)
    if not self.count_null_words and not self.stem_words and not self.lemmatize:
        text = " ".join(text.split())

    return text
ax1.grid(zorder=1)
ax1.xaxis.grid(False)
# Histogram of talk durations in 250-second bins, with dashed/dash-dotted
# vertical lines marking the mean and the median.
plt.hist(td['duration'], range(0, 4000, 250), zorder=0, color="#66B266")
plt.xlabel('duration(seconds)')
plt.ylabel('How many talks in that duration')
plt.title('TED duration Distribution')
plt.axvline(x=td['duration'].mean(), linestyle='--')
plt.axvline(x=td['duration'].median(), color='#FFFF7F', linestyle='-.')
plt.legend(['mean of duration', 'median of duration'], loc='upper right')
plt.show()

# Top-10 tags. Pieces starting with one of these characters are the list
# punctuation left over after splitting on the quote character.
m = ['[', "'", ',', ']']
tags_split = []
for t in title_rank['tags']:
    # Build a filtered copy instead of removing from the list being
    # iterated, and skip empty pieces (indexing them raised IndexError).
    t = [piece for piece in t.split("'") if piece and piece[0] not in m]
    tags_split.append(t)
title_rank['tags_split'] = tags_split

# Distinct tags in order of first appearance.
indi_tag = list(dict.fromkeys(w for row in tags_split for w in row))

# Number of tags per talk.
tags_count = [len(t) for t in title_rank['tags_split']]
from textblob.decorators import requires_nltk_corpus from textblob.base import BaseTagger import os # importing a text: text1 = filehandle = open('corpus1.txt') text = filehandle.read() # removes \n from the text and adds each line to lines_list lines_list = text.splitlines() # joins the elements of lines_list and makes raw text text_raw = "".join(lines_list) # implement raw text to textblob blob = TextBlob(text_raw) # parsing sentences sentences = blob.split('.') # tag senteces tags = blob.tags # print the sentences print('\n\n{:-^160}'.format(' Parsed Sentences ') + '\n\n') for i in range(0, len(sentences)): print(sentences[i]) # put tags into a list list3_tags = [] for i in range(0, len(sentences)): list3_tags.append(tag(sentences[i])) tags_list_final = [] tags_final = [] # separate tags by sentence for i in range(0, (len(list3_tags) - 1)): for j in range(0, len(list3_tags[i])):
def sentiment_page():
    """Build the sentiment pin map for the current user and open it.

    Reads the ``UID``, ``Location`` and ``Keyword`` cookies of the current
    request, scores every row of the ``Tweets`` table with TextBlob and puts
    a colour-coded marker on the map for each non-neutral tweet that contains
    the keyword (every tweet when the keyword is ``"NONE"``). The map is
    written to ``templates/<uid>.html`` next to this module, two lists with
    the most frequent words of negative and of positive tweets are inserted
    after the first line containing ``body``, the file is opened in a browser
    tab and the request is redirected to ``/Home``.

    Returns:
        The value of ``redirect('/Home')``.
    """
    from html import escape  # local import: only needed for the injected lists

    uid = request.cookies.get("UID")
    location = request.cookies.get("Location")
    keyword = request.cookies.get("Keyword")
    coordDict = {
        "NY": (40.7829, -73.9682),
        "LA": (34.0522, -118.2436),
        "CH": (41.8781, -87.6232),
        "US": (39.8283, -98.5795)
    }
    # A missing or unknown cookie value used to raise KeyError below; fall
    # back to the country-wide view instead.
    if location not in coordDict:
        location = "US"
    zoom = 11 if location != "US" else 5

    print("Starting Analysis...")
    db = get_db()
    c = db.cursor()
    latitude, longitude = coordDict[location]
    # NOTE(review): the API key is committed in source. It can now be
    # overridden through GOOGLE_MAPS_API_KEY; the literal is kept only as a
    # backward-compatible default and should be rotated and removed.
    gmap = gmplot.GoogleMapPlotter(
        latitude,
        longitude,
        zoom,
        apikey=os.environ.get('GOOGLE_MAPS_API_KEY',
                              'AIzaSyCEYyEKiSKuoEW20-XKL53kJ3CuySnWVbI'))

    posNounPhrases = dict()
    negNounPhrases = dict()
    count = 0  # tweets whose polarity is exactly 0
    for row in c.execute('''SELECT * FROM Tweets'''):
        # text is 2, lats are 3, lons are 4
        tweetBlob = TextBlob(row[2])
        nounList = tweetBlob.split()
        if not (keyword in nounList or keyword == "NONE"):
            continue
        colour, is_positive = _polarity_marker(
            float(tweetBlob.sentiment.polarity))
        if colour is None:
            # Neutral tweets get no marker, they are only counted.
            count += 1
            continue
        gmap.marker(float(row[3]), float(row[4]), colour)
        bucket = posNounPhrases if is_positive else negNounPhrases
        for word in nounList:
            bucket[word] = bucket.get(word, 0) + 1

    dir_path = os.path.dirname(os.path.realpath(__file__))
    newPath = dir_path + "/templates/{0}.html".format(uid)
    gmap.draw(newPath)

    posStr = _top_phrases(posNounPhrases)
    negStr = _top_phrases(negNounPhrases)
    print("Most positive phrases: " + posStr)
    print("Most negative phrases: " + negStr)
    print("Total tweets with neutral sentiment: " + str(count))

    with open(newPath, "r+") as f:
        data = f.readlines()
        # Rewind before writing: after readlines() the position is at EOF,
        # so writing without seek(0) appended a second copy of the page.
        f.seek(0)
        after_body = False
        inserted = False
        for line in data:
            if after_body and not inserted:
                inserted = True
                # Words come from tweet text, so escape them before they are
                # written into the page. Negative words belong under "upset",
                # positive words under "happy" (they used to be swapped).
                f.write('<ul class="list-group">\n')
                f.write(
                    '<li class="list-group-item">People in {0} are upset about . . .</li>\n'
                    .format(escape(location, quote=False)))
                f.write(
                    '<li class="list-group-item">{0}</li>\n'.format(
                        escape(negStr, quote=False)))
                f.write('</ul>\n\n')
                f.write('<ul class="list-group">\n')
                f.write(
                    '<li class="list-group-item">People in {0} are happy about . . .</li>\n'
                    .format(escape(location, quote=False)))
                f.write(
                    '<li class="list-group-item">{0}</li>\n'.format(
                        escape(posStr, quote=False)))
                f.write('</ul>\n')
            if "body" in line:
                after_body = True
            f.write(line)
        f.truncate()

    wb.open_new_tab("file://" + newPath)
    return redirect('/Home')


def _polarity_marker(polarity):
    """Return ``(marker colour, is_positive)`` for a polarity score.

    The colour is ``None`` when the polarity is exactly 0 (no marker).
    """
    if polarity > 0:
        for floor, colour in ((.75, 'maroon'), (.5, 'red'),
                              (.25, 'deeppink')):
            if polarity > floor:
                return colour, True
        return 'pink', True
    if polarity == 0:
        return None, None
    for floor, colour in ((-.25, 'lightblue'), (-.5, 'deepskyblue'),
                          (-.75, 'cornflowerblue')):
        if polarity > floor:
            return colour, False
    # Everything at or below -0.75. This branch used to repeat the "> -.75"
    # test and was therefore unreachable.
    return 'darkslateblue', False


def _top_phrases(counts):
    """Join up to 10 of the 100 most frequent words longer than 3 characters.

    Each selected word is followed by ``", "``; ties in frequency are ordered
    by the word itself, descending.
    """
    ranked = sorted(((total, word) for word, total in counts.items()),
                    reverse=True)
    chosen = [word for _, word in ranked[:100] if len(word) > 3][:10]
    return "".join(word + ", " for word in chosen)
def calc_sentiment(text):
    """Score a tweet after cleaning it.

    The text is passed through ``clean_tweet`` and wrapped in a ``TextBlob``.

    Returns:
        A two-item list: the blob's sentiment polarity, and the number of
        pieces obtained by splitting the cleaned text on a single space.
    """
    cleaned_blob = TextBlob(clean_tweet(text))
    polarity = cleaned_blob.sentiment.polarity
    piece_count = len(cleaned_blob.split(" "))
    return [polarity, piece_count]
def inputNumber(message):
    """Run the interactive BTL text-analysis menu.

    Prompts with *message* until the user enters 1, 2, 3 or 4, then:

    * 1 - analyses a text file whose name is typed in,
    * 2 - analyses the text of a web page whose URL is typed in,
    * 3 - analyses text typed in directly,
    * 4 - quits.

    Every analysis prints sentiment, the most common words (with a frequency
    plot) and the most common 2/3/4-word phrases; choices 1 and 3 also print
    a Flesch reading-ease score. Afterwards the user is asked whether to run
    again; "Y"/"y" and any unrecognised answer return to the menu prompt,
    "N"/"n" calls ``quit()``.

    Args:
        message: Prompt shown when asking for the menu choice.
    """
    while True:
        try:
            userInput = int(input(message))
        except ValueError:
            print("Invalid input. Please enter a number: 1, 2, 3, or 4.")
            continue
        if userInput not in [1, 2, 3, 4]:
            print("Invalid integer. Please enter 1, 2, 3, or 4.")
            continue

        # ---------------- CHOICE 1: DOCUMENT FILE ----------------
        if userInput == 1:
            docchoice = input("Please enter the name of the Text File.\n")
            # Context manager so the file handle is released after reading.
            with open(docchoice, 'r') as sourcedoc:
                readsource = sourcedoc.read()
            lowfile = readsource.lower()
            sentblob = TextBlob(lowfile)
            # Slashes and dashes become spaces so joined words are split.
            spliced = sentblob.replace('/', ' ').replace('-', ' ').replace(
                '–', ' ')
            # NOTE(review): remove_punctuation is now called once and reused;
            # assumes it has no side effects - confirm.
            finaltext = str(remove_punctuation(spliced))
            filepunct = TextBlob(finaltext)

            _print_sentiment_report(sentblob)

            clean_tokens = _drop_stopwords(
                _lemmatized_tokens(filepunct), [
                    'ie', 'may', 'us', 'shall', 'etc', 'thereof', '2', '1',
                    '0', '–', '’', '’', '“', '”'
                ])
            count = Counter(clean_tokens)
            print("\n-------30 MOST COMMON WORDS-------: \n")
            for key, value in count.most_common(30):
                print(" " + str(value) + " - " + key)
            print("\n-------FREQUENCY CHART-------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(15, cumulative=False)

            _print_common_phrases(finaltext, 10)

            flesh = int(textstat.flesch_reading_ease(readsource))
            print("--------FLESCH-KINCLAID TEST--------\n",
                  "\n Readability Score: ", flesh)
            # This menu entry has always treated 90 as "very easy".
            print((
                " Very difficult to read. Best understood by university graduates.",
                " Difficult to read.",
                " Fairly difficult to read.",
                " Plain English. Easily understood by 13- to 15-year-old students.",
                " Fairly easy to read.",
                " Fairly easy to read.",
                " Very easy to read. Easily understood by an average 11-year-old student.",
            )[_flesch_band(flesh, very_easy_from=90)])
            print("-----------------------------------\n")

            _ask_run_again(
                "\nThank you for using BTL 0.6. Run Again? [Y / N]\n")

        # ---------------- CHOICE 2: URL / LINK ----------------
        elif userInput == 2:
            webchoice = input("Please enter the URL of the website.\n")
            # Context manager so the HTTP response is closed after reading.
            with urllib.request.urlopen(webchoice) as webdoc:
                readweb = webdoc.read()
            websoup = w3lib.html.remove_tags(readweb)
            print(websoup)
            lowweb = websoup.lower()
            websent = TextBlob(lowweb)
            # Same splicing as for files, plus removal of page markers.
            spliced = websent.replace('/', ' ').replace('-', ' ').replace(
                '–', ' ').replace(' – ', ' ').replace(' p. ', ' ').replace(
                    ' pp.', ' ')
            # NOTE(review): remove_punctuation is now called once and reused;
            # assumes it has no side effects - confirm.
            finalweb = str(remove_punctuation(spliced))
            webpunct = TextBlob(finalweb)

            _print_sentiment_report(websent)

            clean_tokens = _drop_stopwords(
                _lemmatized_tokens(webpunct), [
                    'ie', 'may', 'us', 'shall', 'etc', 'thereof', " ",
                    'mwparseroutput', 'wwww3org', 'xmlnshttp', 'also', '1',
                    '0', 'svg', '2', 'jw', '’', '“', '”', 'u'
                ])
            count = Counter(clean_tokens)
            print("\n---------MOST COMMON WORDS---------: \n")
            for key, value in count.most_common(30):
                print(" " + key + " - " + str(value))
            print("\n---------FREQUENCY CHART---------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)

            _print_common_phrases(finalweb, 20)

            _ask_run_again(
                "\nThank you for using BTL 0.6. Run Again? [Y / N]",
                farewell="Bye!")

        # ---------------- CHOICE 3: MANUAL INPUT ----------------
        elif userInput == 3:
            manchoice = input("Please enter your text here:\n")
            lowman = manchoice.lower()
            mansoup = BeautifulSoup(lowman, 'html5lib')
            mantext = mansoup.get_text(strip=True)
            mansent = TextBlob(mantext)
            # Sentiment is scored on the text exactly as typed.
            sent = TextBlob(manchoice)
            # NOTE(review): remove_punctuation is now called once and reused;
            # assumes it has no side effects - confirm.
            finalman = str(remove_punctuation(mansent))
            manpunct = TextBlob(finalman)

            _print_sentiment_report(sent)

            # This choice counts the raw whitespace-split words; the POS
            # tagging / lemmatising that used to run here was never used.
            clean_tokens = _drop_stopwords(
                list(manpunct.split()),
                ['ie', 'may', 'us', 'shall', 'etc', 'thereof', '—'])
            count = Counter(clean_tokens)
            print("\n------35 MOST COMMON WORDS------: \n")
            for key, value in count.most_common(35):
                print(" " + key + " - " + str(value))
            print("\n------FREQUENCY CHART------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)

            _print_common_phrases(finalman, 10)

            flesh = int(textstat.flesch_reading_ease(manchoice))
            print("\n----------FLESCH-KINCLAID TEST----------:\n",
                  "\n Readability Score: ", flesh, "\n")
            print((
                " --Very difficult to read. Best understood by university graduates.--",
                " --Difficult to read.--",
                " --Fairly difficult to read.--",
                " --Plain English. Easily understood by 13 to 15-year-old students.--",
                " --Fairly easy to read.--",
                " --Fairly easy to read.--",
                " --Very easy to read. Easily understood by an average 11-year-old student.--",
            )[_flesch_band(flesh)])
            print("\n------------------------------------------\n")

            _ask_run_again(
                "\nThank you for using BTL 0.3. Run Again? [Y / N]",
                farewell="Bye!")

        # ---------------- CHOICE 4: QUIT PROGRAM ----------------
        else:
            print("Thank you for using BTL 0.5. Bye!")
            quit()
            break


def _print_sentiment_report(blob):
    """Print the sentiment legend, then the polarity/subjectivity of *blob*."""
    print("\n-----------------------------------------------")
    print("-----Sentiment Analysis Guide------------------")
    print("-----------------------------------------------")
    print(" Polarity(Emotion): \n [ -1:Negative, 0:Neutral, 1:Positive ]")
    print("\n Subjectivity(Fact VS Opinion): \n [ 0:Objective 1:Subjective ]")
    print("------------------------------------------------")
    polar = blob.sentiment.polarity
    subject = blob.sentiment.subjectivity
    print("\n|------------------------------------|")
    print("|-----SENTIMENT ANALYSIS RESULTS-----|")
    print("|------------------------------------|")
    print("| Polarity: ", polar, " \n| Subjectivity: ", subject, " ")
    print("|------------------------------------|")


def _lemmatized_tokens(blob):
    """Return the words of *blob* lemmatised by POS tag, punctuation removed."""
    # First letter of the tag -> lemmatiser POS code; anything else is
    # treated as a noun.
    tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    punctuate = str.maketrans('', '', string.punctuation)
    return [
        word.lemmatize(tag_dict.get(pos[0], 'n')).translate(punctuate)
        for word, pos in blob.tags
    ]


def _drop_stopwords(tokens, extra_stopwords):
    """Return *tokens* without English stop words or *extra_stopwords*."""
    # A set gives O(1) membership; the result matches the former
    # copy-then-list.remove() loop, which removed every occurrence too.
    stoplist = set(stopwords.words('english'))
    stoplist.update(extra_stopwords)
    return [token for token in tokens if token not in stoplist]


def _print_common_phrases(text, top_n):
    """Print the *top_n* most common 2-, 3- and 4-word phrases of *text*."""
    bitokens = nltk.word_tokenize(text)
    count = nltk.FreqDist(nltk.ngrams(bitokens, 2)).most_common(top_n)
    count2 = nltk.FreqDist(nltk.ngrams(bitokens, 3)).most_common(top_n)
    count3 = nltk.FreqDist(nltk.ngrams(bitokens, 4)).most_common(top_n)
    print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
    for (key, key2), value in count:
        print(" ", key, "", key2, "", "-", value)
    print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
    for (key, key2, key3), value in count2:
        print(" ", key, "", key2, "", key3, "-", value)
    print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
    for (key, key2, key3, key4), value in count3:
        print(" ", key, "", key2, "", key3, "", key4, "-", value)


def _flesch_band(score, very_easy_from=91):
    """Return the index (0 = hardest .. 6 = easiest) of the band for *score*.

    Bands are contiguous and open-ended at both extremes, so every integer
    score maps to a message. Scores of 30, 50, 60, 70 and 80, negative scores
    and scores of 100 or more used to fall between the ``range`` checks and
    print nothing.
    """
    if score >= very_easy_from:
        return 6
    for index, upper in enumerate((30, 50, 60, 70, 80)):
        if score <= upper:
            return index
    return 5


def _ask_run_again(prompt, farewell=None):
    """Ask whether to run again; quit on "N"/"n", otherwise just return.

    The caller's menu loop re-prompts after this returns, which replaces the
    former recursive ``return inputNumber(message)``.
    """
    again = input(prompt)
    if again in ["Y", "y"]:
        print("What kind of document?")
    elif again in ["N", "n"]:
        if farewell is not None:
            print(farewell)
        quit()
    else:
        print("\nSorry, didn't catch that. Please select an option below:")
#textblob for spelling correction check_spel = pos_str pos_str = TextBlob(check_spel) try: if check_spel in word_vectors.vocab: pos_str = str(pos_str) # remove spaces both in the beginning and in the end of of string pos_str = re.sub("^\s+|\s+$", "", pos_str, flags=re.UNICODE) # any input that is NOT a-z, A-Z, 0-9,-,* pos_str = re.sub('[^a-zA-Z0-9-_*.]', ' ', pos_str) pos_str = re.sub(' +', ' ', re.sub('\W', ' ', pos_str)) pos_words = pos_str.split(' ') if (len(pos_words[0]) > 0): st.write('SIMILAR TO ', pos_str) df = pd.DataFrame(model.wv.most_similar(positive=pos_words, topn=10), columns=['SIMILAR_word', 'similarity']) df1 = df[['SIMILAR_word']] link_list = [] for i in df['SIMILAR_word']: word = 'https://scholar.google.nl/scholar?hl=nl&as_sdt=0%2C5&q=' + i link_list.append(word) # rename column as SIMILAR for UI