# Check out the noun phrases; they will be useful for frequent feature extraction.
comments.noun_phrases

# In[9]:

# Compactness pruning:
cleaned = list()
for phrase in comments.noun_phrases:
    count = 0
    for word in phrase.split():
        # Count short words and words without an English definition
        if len(word) <= 2 or (not Word(word).definitions):
            count += 1
    # Keep the phrase only if short or 'nonsensical' words make up less than
    # 40% (an arbitrary threshold) of it; everything else is pruned.
    if count < len(phrase.split()) * 0.4:
        cleaned.append(phrase)

print("After compactness pruning:\nFeature Size:")
len(cleaned)

# In[10]:

for phrase in cleaned:
    match = list()
print(wiki.tags)
print(wiki.noun_phrases)

testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print(testimonial.sentiment)

zen = TextBlob("Beautiful is better than ugly. "
               "Explicit is better than implicit. "
               "Simple is better than complex.")
print(zen.words)
print(zen.sentences)
for sentence in zen.sentences:
    print(sentence.sentiment)

sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words)
print(sentence.words[2].singularize())
print(sentence.words[-1].pluralize())

w = Word("octopi")
print(w.lemmatize())

w = Word("went")
print(w.lemmatize("v"))

word = Word("octopus")
print(word.synsets)
print(Word("hack").get_synsets(pos=VERB))
print(Word("octopus").definitions)

octopus = Synset('octopus.n.02')
shrimp = Synset('shrimp.n.03')
print(octopus.path_similarity(shrimp))

animals = TextBlob("cat dog octopus")
print(animals.words.pluralize())

b = TextBlob("I havv goood speling!")
print(b.correct())

w = Word('falibility')
print(w.spellcheck())
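# The tour above ends at spelling; one more piece of the same TextBlob API
# worth a line here is n-gram extraction (TextBlob.ngrams). A minimal sketch:
blob = TextBlob("Now is better than never.")
print(blob.ngrams(n=3))
# e.g. [WordList(['Now', 'is', 'better']), WordList(['is', 'better', 'than']), ...]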
## Removing rare words
import pandas as pd

sil = pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]
df["text"] = df["text"].apply(
    lambda x: " ".join(x for x in x.split() if x not in sil))

## Lemmatization
from textblob import Word
nltk.download("wordnet")
df["text"] = df["text"].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

## CREATING THE TRAIN-TEST SPLIT
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df["text"], df["label"], random_state=1)

# Fit the encoder on the training labels and reuse the same mapping
# for the test labels
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)
##print(train_y[0:5])
##print(test_y[0:5])
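# A minimal, self-contained sketch of the fit/transform discipline used above
# (illustrative labels, not from the original data): fit the LabelEncoder on
# the training labels once, then reuse the learned mapping on the test labels.
from sklearn import preprocessing

example_encoder = preprocessing.LabelEncoder()
y_train_example = example_encoder.fit_transform(["pos", "neg", "pos"])  # learns the classes
y_test_example = example_encoder.transform(["neg", "pos"])              # reuses them
print(example_encoder.classes_)  # ['neg' 'pos']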
def words(text): list_digit_words = [ 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine' ] extractor = ConllExtractor() blob = TextBlob(text, np_extractor=extractor) noun_phrases = blob.noun_phrases #print(noun_phrases) print('The voicemail is from ', end='') for np in noun_phrases: npp = np.split(' ') if len(npp) <= 2: if np != 'phone number' and np != 'good day' and np != 'great day': if np not in list_digit_words: #if len(np)<=3: print(np.title() + ' ', end='') verbs = list() for word, tag in blob.tags: if tag == 'VB': verbs.append(word.lemmatize()) #print(verbs) verbs_l = list() for i in verbs: i_l = i.lower() verbs_l.append(i_l) nouns = list() #print(verbs_l) if 'please' in verbs_l: next_word = verbs_l[verbs_l.index('please') + 1] print("Please" + ' ' + next_word + '\n', end='') if next_word == 'call': print(' regarding ', end='') for word, tag in blob.tags: if tag == 'NN': nouns.append(word.lemmatize()) #print(nouns) num = len(nouns) for item in random.sample(nouns, num): word = Word(item) if (word != 'phone' and word != 'number' and word != 'name'): #if (word!='number'): print(word, end=' ') else: print('Please \n ', end='') for verb in verbs_l: print(verb, end=' ') for word, tag in blob.tags: if tag == 'NN': nouns.append(word.lemmatize()) #print(nouns) #print("\nThe voicemail is about ", end='') num = len(nouns) for item in random.sample(nouns, num): word = Word(item) if (word != 'phone' and word != 'number' and word != 'name'): if word not in list_digit_words: #if (word!='number'): print(word, end=' ')
from textblob import TextBlob
from textblob import Word

s1 = "The Game Of Thrones"
s2 = "Winter is coming in thrones"
s3 = "White walkers are coming to kill all"
d1 = TextBlob(s1)
d2 = TextBlob(s2)
d3 = TextBlob(s3)
word = Word("thrones")
doclist = [d1, d2, d3]
for doc in doclist:
    wordlist = doc.words
    #noofwords = len(wordlist)
    for word in wordlist:
        tf = doc.words.count(word)
        print(tf)
        print(word + " occurs " + str(tf) + " times")
        #ntf = tf / noofwords
        #print(ntf)
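# A tighter way to get the same term frequencies (a sketch using the standard
# library's Counter, so each distinct word is counted and printed only once):
from collections import Counter
from textblob import TextBlob

doc = TextBlob("Winter is coming in thrones")
total = len(doc.words)
for token, tf in Counter(w.lower() for w in doc.words).items():
    print(f"{token} occurs {tf} time(s); normalized tf = {tf / total:.2f}")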
from textblob import TextBlob, Word
from textblob.wordnet import VERB

raw_query = "Physics is a better subject to study than Mathematics. I like Physics more than I like Mathematics. Physicists are more intelligent than Mathematicians."

# Get input ready for use
query = TextBlob(raw_query)
print('Query: ', query)

tags = query.tags
print('Tags: ', tags)

nouns = query.noun_phrases
print('Nouns: ', nouns)

sentiment = query.sentiment
print('Sentiment: ', sentiment)

words = query.words
print('Words: ', words)

sentences = query.sentences
print('Sentences: ', sentences)

parse = query.parse()
print('Parse: ', parse)

language = query.detect_language()
print('Language: ', language)

# TODO: add spelling checks to correct the input sentences for better searches
corrected = query.correct()
print('Corrected: ', corrected)

# Search for results
w = Word('Octopus')
print('\nSynsets: ', w.synsets)
print('\nDefinitions: ', w.definitions)
print(Word("hack").get_synsets(pos=VERB))
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text


print(
    remove_special_characters("Well this was fun! What do you think? 123#@!",
                              remove_digits=True))

###########################################
######## STEP 7. Correct Spelling #########
###########################################
from textblob import Word

w = Word('fianlly')
print(w.correct())

w = Word('flaot')
print(w.spellcheck())

###########################################
############ STEP 8. Stemming #############
###########################################
# There are several stemmers: PorterStemmer, LancasterStemmer, etc.
# I will go with SnowballStemmer because I know it.
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("english")
print(ss.stem("jumping"))
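# spellcheck() returns (candidate, confidence) pairs, which allows a more
# cautious correction than correct(): only accept a suggestion when its
# confidence clears a threshold. A sketch (the 0.8 cutoff is an arbitrary
# choice, not from the original):
def cautious_correct(token, min_confidence=0.8):
    candidate, confidence = Word(token).spellcheck()[0]  # best candidate first
    return candidate if confidence >= min_confidence else token

print(cautious_correct('fianlly'))  # likely corrected to 'finally'
print(cautious_correct('flaot'))    # likely corrected to 'float'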
string = re.sub(r"\?", "", string) string = re.sub(r"'", "", string) string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) string = re.sub(r"[0-9]\w+|[0-9]", "", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() data = pd.read_csv('../dataset/dataset.csv') x = data['news'].tolist() y = data['type'].tolist() for index, value in enumerate(x): print("processing data:", index) x[index] = ' '.join( [Word(word).lemmatize() for word in clean_str(value).split()]) vect = TfidfVectorizer(stop_words='english', min_df=2) X = vect.fit_transform(x) Y = np.array(y) print("no of features extracted:", X.shape[1]) X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42) print("train size:", X_train.shape) print("test size:", X_test.shape)
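# A minimal sketch of the natural next step (assuming scikit-learn, which the
# split above already uses): fit a simple baseline classifier on the TF-IDF
# features and score it on the held-out set.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf = MultinomialNB()
clf.fit(X_train, y_train)
print("test accuracy:", accuracy_score(y_test, clf.predict(X_test)))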
Tokenization refers to dividing text into a sequence of words or sentences. In our example, we used the textblob library to first transform our tweets into blobs and then convert them into a series of words.
"""

TextBlob(combi['tweet2'][1]).words

import nltk
nltk.download('wordnet')

"""#Lemmatization
Lemmatization is a more effective option than stemming because it converts the word into its root form rather than just stripping suffixes. It makes use of the vocabulary and morphological analysis to obtain the root word, which is why lemmatization is usually preferred over stemming.
"""
from textblob import Word
combi['tweet3'] = combi['tweet2'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
combi['tweet3'].head()

from wordcloud import WordCloud
text = combi['tweet3'].to_string()
wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(text)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

neg_tweets = combi[combi.sentiment == 0]
neg_string = []
for t in neg_tweets.tweet3:
object_synsets = json.load(open(config.object_synset_path, 'r')) attribute_synsets = json.load(open(config.attribute_synset_path, 'r')) # synsets['crisp'] = 'chip.n.04' synsets = {} for key, val in object_synsets.items(): synsets[key] = val for key, val in attribute_synsets.items(): if key not in synsets: synsets[key] = val answer_dict = cPickle.load(open(config.answer_dict_path, 'rb')) vocab_with_synset = [] for v in answer_dict['vocab']: v_w = Word('_'.join(v.split())) if len(v_w.synsets) > 0: synsets[v] = v_w.synsets[0].name() if v in synsets: vocab_with_synset.append(v) hypernym_set = set() vocab_hypernyms = {} max_depth = defaultdict(int) for v in tqdm(vocab_with_synset, desc='make hypernymset'): # Synset('chip.n.04') word_synset = wn.synset(synsets[v]) # set([Synset('nutriment.n.01'), Synset('food.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01'), Synset('matter.n.03'), Synset('dish.n.02'), Synset('snack_food.n.01'), Synset('substance.n.07'), Synset('chip.n.04')]) hypernyms = [] hypernym2distance = {} for hypernym, distance in list(word_synset.hypernym_distances()):
data = data.drop(data[data.sentiment == 'hate'].index) data = data.drop(data[data.sentiment == 'neutral'].index) data = data.drop(data[data.sentiment == 'worry'].index) # Making all letters lowercase data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split())) # Removing Punctuation, Symbols data['content'] = data['content'].str.replace('[^\w\s]',' ') # Removing Stop Words using NLTK stop = stopwords.words('english') data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop)) #Lemmatisation data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) #Correcting Letter Repetitions def de_repeat(text): pattern = re.compile(r"(.)\1{2,}") return pattern.sub(r"\1\1", text) data['content'] = data['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) # Code to find the top 10,000 rarest words appearing in the data freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:] # Removing all those rarely appearing words from the data freq = list(freq.index) data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
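# What de_repeat actually does: any character repeated three or more times is
# collapsed to exactly two. A quick illustration:
print(de_repeat("I loooove this sooooo much"))  # -> "I loove this soo much"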
def Lemmatize(data): data['Requirement'] = data['Requirement'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) return data
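# Usage sketch for Lemmatize above, on a tiny illustrative DataFrame (the
# column name 'Requirement' matches the function's expectation):
import pandas as pd

sample = pd.DataFrame({"Requirement": ["users need reports", "systems log errors"]})
print(Lemmatize(sample)["Requirement"].tolist())
# roughly: ['user need report', 'system log error']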
from pathlib import Path
from textblob import TextBlob

# Load the entire novel into a TextBlob object
blob = TextBlob(Path('RomeoAndJuliet.txt').read_text())
#print(blob.words.count('joy'))  # count occurrences of the word 'joy' in R&J
#print(blob.noun_phrases.count('lady capulet'))  # the phrase shows up 46 times

# Synonyms and antonyms of words
from textblob import Word
happy = Word('happy')
#print(happy.definitions)  # dictionary definitions of the word
print(happy.synsets)
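# The synsets above also expose synonyms and antonyms through their WordNet
# lemmas; a sketch using the same 'happy' Word:
synonyms = set()
antonyms = set()
for synset in happy.synsets:
    for lemma in synset.lemmas():
        synonyms.add(lemma.name())
        for antonym in lemma.antonyms():
            antonyms.add(antonym.name())
print(synonyms)  # includes 'happy', 'felicitous', ...
print(antonyms)  # includes 'unhappy'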
def export_language_file(csvfile_lang):
    words = pd.read_csv(csvfile_lang)
    wordlist = list(words['word'].values)
    for cc, the_word in enumerate(wordlist):
        # We want to look for the root word first. If we don't find
        # anything, try removing any common suffixes.
        found_result = False

        # ------------
        # Original word
        browser = mechanize.Browser()
        response = browser.open('http://www.etymonline.com/')
        browser.select_form(nr=0)
        try:
            browser['search'] = the_word
        except TypeError:
            pass
        resp = browser.submit()
        html_doc = resp.read()

        # now use BeautifulSoup to go through resp.read()
        soup = BeautifulSoup(html_doc, 'html.parser')
        matching_words = {}
        # dt = dictionary term
        dts = soup.find_all('dt')
        # dd = dictionary definition
        dds = soup.find_all('dd')

        # Look for the original word in each search result
        for dt, dd in zip(dts, dds):
            # get the text of the term, in the form:
            #   this (n.)
            #   that (n.)
            #   foo (v.)
            #   bar (adj.)
            this_word_full = dt.get_text()
            # remove (n./adj./v.)
            this_word = this_word_full.split(' ')[:-2]
            this_word = ''.join(this_word)
            if the_word.lower() == this_word.lower():
                # We have an exact match: the etymology from the dd tag
                # corresponding to our dt tag is the value
                etym = dd.get_text()
                matching_words[this_word] = etym
                found_result = True
                break

        # Now check for common suffixes...
        if found_result == False:
            # pick out synonyms from the synset that are basically the same
            # word (share the first 3 letters), and strip them of the part
            # of speech (n., v., and so on...)
            #
            # synsets look like:
            #   swing.n.04
            # so use .split('.')[0] (first token before the dot)
            try:
                synset = Word(the_word).synsets
            except Exception:
                synset = []  # an empty list keeps the loops below safe
            synset_strings = [syn.name().split('.')[0] for syn in synset]
            final_synset = []
            for ss, sss in zip(synset, synset_strings):
                if sss[:3] == the_word[:3]:
                    final_synset.append(sss)
            final_synset = list(set(final_synset))

            # Look for the synonyms in each search result
            for dt, dd in zip(dts, dds):
                # get the text of the term
                this_word_full = dt.get_text()
                # remove (n./adj./v.)
                this_word = this_word_full.split(' ')[:-2]
                this_word = ''.join(this_word)
                for syn in final_synset:
                    if syn.lower() == this_word.lower():
                        # Exact match: the etymology from the dd tag
                        # corresponding to our dt tag is the value
                        etym = dd.get_text()
                        matching_words[this_word] = etym
                        found_result = True
                        break

        # if found_result is False, we skip this word;
        # otherwise we extract the etymology info.
        if found_result:
            # remember: this_word and the_word are all lowercase,
            # just to avoid confusion.
            # avoid false positives...
            if the_word in matching_words.keys():
                etymology = matching_words[the_word]
                # create a grid with the location (in the etymology)
                # of each language's reference.
                ## old way (incorrect, 'Germanic' always flags 'German' too)
                #etymology_grid = [etymology.index(lang) if lang in etymology else -1 for lang in languages]
                etymology_grid = []
                for lang in languages:
                    # regex search for the language name on a word boundary
                    m = re.search(lang + r'\b', etymology)
                    # double check whether N is actually Old N
                    n = re.search('Old ' + lang + r'\b', etymology)
                    if m and not n:
                        etymology_grid.append(m.start())
                    else:
                        etymology_grid.append(-1)

                # each word is tagged with whatever language
                # is referenced FIRST in the etymology
                etymology_grid_gt0 = [eg for eg in etymology_grid if eg >= 0]
                # check whether there IS a result
                if len(etymology_grid_gt0) > 0:
                    # etymology_grid holds the position of each language's
                    # first reference; the minimum comes first.
                    #
                    # (Pdb) p etymology_grid
                    # [-1, 239, 227, -1, 188, 184, 71, 71, 138, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1]
                    #
                    # (Pdb) etymology_grid_gt0
                    # [239, 227, 188, 184, 71, 71, 138, 0]
                    #
                    # (Pdb) p [languages[ii] for ii in range(len(etymology_grid)) if etymology_grid[ii] > 0]
                    # ['Greek', 'Latin', 'Norse', 'Old Norse', 'German', 'Dutch']
                    #
                    # pair each referenced language with its first-mention
                    # position, then sort by position to rank the languages
                    ranked = sorted((et, languages[ii])
                                    for ii, et in enumerate(etymology_grid)
                                    if et >= 0)
                    ranked_langs = [lang for _, lang in ranked]
                    try:
                        language1_name = ranked_langs[0]
                    except IndexError:
                        language1_name = ''
                    try:
                        language2_name = ranked_langs[1]
                    except IndexError:
                        language2_name = ''
                    #first_lang_ref = min(etymology_grid_gt0)
                    #last_lang_ref = max(etymology_grid_gt0)

                    print("")
                    print("Tagging word %d of %d: %s" % (cc, len(wordlist), the_word))
                    print(the_word, ":", language1_name)
                    words.loc[words['word'] == the_word, 'root language'] = language1_name
                    words.loc[words['word'] == the_word, 'second language'] = language2_name
                    words.loc[words['word'] == the_word, 'ranked languages'] = ",".join(ranked_langs)

                    if cc % 50 == 0:
                        print("Exporting to file...")
                        words.to_csv(csvfile_lang, na_rep="")
                        print("done")

    print("Exporting to file...")
    words.to_csv(csvfile_lang, na_rep="")
    print("done")
data.head()

stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data1['text'] = data1['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.head()

##### Removing punctuation
data['text'] = data['text'].str.replace('[^\w\s]', '')
data1['text'] = data1['text'].str.replace('[^\w\s]', '')

nltk.download('wordnet')
from textblob import Word
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data.head()
data1['text'] = data1['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data1.head()

## Remove special characters
import re
data['text'] = data['text'].apply(lambda x: re.sub(r'\W+', ' ', x))
data1['text'] = data1['text'].apply(lambda x: re.sub(r'\W+', ' ', x))
data.head()

#### Remove numbers
data['text'] = data['text'].str.replace('\d+', '')
# Section 11.2.8 snippets
from textblob import Word

index = Word('index')
index.pluralize()

cacti = Word('cacti')
cacti.singularize()

from textblob import TextBlob
animals = TextBlob('dog cat fish bird').words
animals.pluralize()
async def meaning(self, ctx, defword):
    "defines a word"
    definitions = Word(defword).definitions
    if definitions:
        await ctx.send("\n".join(definitions))
    else:
        await ctx.send("No definitions found for '{}'.".format(defword))
def analyze_querydata(read_file, write_file):
    count = 0
    msg = ""
    error = ""
    start = time.time()
    alltweets = csv.reader(open(read_file, 'r'))
    with open(write_file, 'w', newline='') as analyzetweets:
        writer = csv.writer(analyzetweets)
        for row in alltweets:
            count = count + 1
            print(str(count))
            try:
                tweet_string = row[1]
                lang_predict = multinomial.predict(tweet_string)
                blob = TextBlob(tweet_string)
                if lang_predict != 'ENGLISH':
                    received_text2 = blob.translate(to='en')
                else:
                    received_text2 = blob
            except (urllib.error.URLError, socket.timeout,
                    exceptions.NotTranslated,
                    exceptions.TranslatorError) as e:
                # All four failure modes were handled identically, so they
                # share one handler: log the error and write a partial row
                error = e
                print(error)
                received_text = tweet_string
                number_of_tokens = len(list(blob.words))
                time_created = row[0]
                blob_words = list(blob.words)
                writer.writerow([count, error, time_created, received_text,
                                 number_of_tokens, blob_words])
            else:
                # using TextBlob for NLP of languages other than Malay
                blob_sentiment = received_text2.sentiment.polarity
                blob_subjectivity = received_text2.sentiment.subjectivity
                textblob_sentiment = ''
                if blob.sentiment.polarity > 0:
                    textblob_sentiment = 'positive'
                elif blob.sentiment.polarity < 0:
                    textblob_sentiment = 'negative'
                elif blob.sentiment.polarity == 0.0:
                    textblob_sentiment = 'neutral'
                number_of_tokens = len(list(blob.words))

                # Extracting Main Points
                nouns = list()
                for word, tag in received_text2.tags:
                    if tag == 'NN':
                        nouns.append(word.lemmatize())
                rand_words = random.sample(nouns, len(nouns))
                final_word = list()
                for item in rand_words:
                    final_word.append(Word(item).pluralize())
                summary = final_word
                end = time.time()
                final_time = end - start
                received_text = tweet_string
                time_created = row[0]
                blob_words = list(blob.words)
                msg = "pass"

                # using Malaya for Malay-language tweets
                if lang_predict == 'MALAY':
                    malaya_sentiment = bayes_sentiment.predict(tweet_string)
                    malaya_proba = bayes_sentiment.predict(tweet_string, get_proba=True)
                    proba_value = malaya_proba.get(malaya_sentiment)
                    writer.writerow([count, msg, time_created, received_text,
                                     number_of_tokens, blob_words,
                                     malaya_sentiment, proba_value,
                                     blob_subjectivity, summary, final_time])
                else:
                    writer.writerow([count, msg, time_created, received_text,
                                     number_of_tokens, blob_words,
                                     textblob_sentiment, blob_sentiment,
                                     blob_subjectivity, summary, final_time])
    print("Done building tweets sentiment!")
import math
from textblob import Word
from textblob.wordnet import VERB
from textblob.wordnet import NOUN
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

word_verb_forms = Word("match").get_synsets(pos=VERB)
word_noun_forms = Word("match").get_synsets(pos=NOUN)

word = TextBlob("modified")
ant_word = TextBlob("complicated")
vad_word = analyser.polarity_scores("modified")
vad_word2 = analyser.polarity_scores("complicated")
print(vad_word)
print(vad_word2)
print(word.sentiment.polarity)
print(ant_word.sentiment.polarity)
#print(word_noun_forms)
#print(word_verb_forms)
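# Side-by-side comparison of the two scorers above over a few more words
# (illustrative inputs; exact numbers depend on each library's lexicon):
for token in ["modified", "complicated", "wonderful", "terrible"]:
    vader_compound = analyser.polarity_scores(token)["compound"]
    blob_polarity = TextBlob(token).sentiment.polarity
    print(f"{token:12s} vader={vader_compound:+.3f} textblob={blob_polarity:+.3f}")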
# Section 11.2.10 snippets
from textblob import Word

word = Word('varieties')
word.stem()
word.lemmatize()
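# Contrasting the two calls above on a few more words: stems need not be
# dictionary words, while lemmas are. A short sketch:
for token in ['varieties', 'studies', 'running']:
    w = Word(token)
    print(token, '-> stem:', w.stem(), '| lemma:', w.lemmatize())
# e.g. varieties -> stem: varieti | lemma: variety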
def clean_textblob(sentence):
    # Tokenize, drop stop words, then lemmatize each remaining token
    # (the original joined the tokens back into a string first, which
    # made the final comprehension iterate over characters).
    stop_words = set(get_stop_words('en'))
    tokens = re.split(r'\W+', sentence)
    tokens = [w for w in tokens if w and w not in stop_words]
    return " ".join(Word(w).lemmatize() for w in tokens)
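# Usage sketch for clean_textblob (lower-case input so the stop-word list
# matches; output shown is approximate):
print(clean_textblob("the cats were chasing the mice in the gardens"))
# roughly: "cat chasing mouse garden"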
except IndexError: continue # NLP for the collection of tweets try: # clean up text tweets = standardize_text(tweetDF, "Text") # get rid of stop words tweets['stopwords'] = tweets['Text'].apply(lambda x: len([x for x in x.split() if x in stopTemp])) tweets['Text'] = tweets['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in stopTemp)) # lemmatize tweets['Text'] = tweets['Text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) # tokenize the text strings tweets["tokens"] = tweets["Text"].apply(tokenizer.tokenize) # add a sentence lengths variable sentence_lengths = [len(tokens) for tokens in tweets["tokens"]] tweets['length'] = pd.Series(sentence_lengths, index=tweets.index) # apply sentiment analysis embeddings = get_word2vec_embeddings(word2vec, tweets) totSent = pd.DataFrame(embeddings).mean(axis=0) # create index labels totSent.index = totSent.index.map(str) totSent.index = 's' + totSent.index.astype(str)
def sentimentPredict(df): df_ppm = df original_cols = df_ppm.columns health_plans = pd.Categorical(df_ppm['healthplan']).categories regions = pd.Categorical(df_ppm['region']).categories rptqtrs = pd.Categorical(df_ppm['rptqtr']).categories # In[3]: df_ppm.shape # In[5]: df_ppm.columns # In[4]: print(df_ppm[['sat1', 'sat1_oe']].head(10)) # In[274]: #for d in df_ppm.describe().columns: # print (df_ppm.describe()[d]) # In[5]: df_ppm.info() # In[5]: df_ppm.isnull().sum() # In[7]: df_ppm.dtypes # In[6]: df_ppm['sat1'] = df_ppm['sat1'].astype(str) for ind, row in df_ppm.iterrows(): try: if (row['sat1_oe'] == '') | (row['sat1_oe'] == ' '): if (row['sat1'] != '') & (float(row['sat1']) > 8.0): df_ppm.loc[ind, 'sat1_oe'] = 'Positive Feedback' elif (row['sat1'] != '') & (float(row['sat1']) <= 7.0): df_ppm.loc[ind, 'sat1_oe'] = 'Negative Feedback' elif (row['sat1'] != '') & (float(row['sat1']) == 8.0): df_ppm.loc[ind, 'sat1_oe'] = 'Passive Feedback' elif row['sat1'] == '': df_ppm.loc[ind, 'sat1_oe'] = '' else: continue except: continue # In[29]: df_ppm.head(10) # In[7]: print(df_ppm[['sat1', 'sat1_oe']].head(10)) # In[8]: #length of comments data_nonnull = df_ppm[(df_ppm['sat1_oe'] != '') & (df_ppm['sat1_oe'] != ' ')] data_nonnull.shape # In[9]: data_nonnull['sat1'] = data_nonnull['sat1'].astype(str) dfppm = data_nonnull[(data_nonnull['sat1'] != '')] # In[10]: dfppm.shape # In[11]: dfppm.reset_index(drop=True, inplace=True) dfppm['review_length'] = dfppm['sat1_oe'].apply(lambda x: len(str(x))) dfppm[['sat1_oe', 'review_length']].head(30) # In[12]: #word count dfppm['word_count'] = dfppm['sat1_oe'].apply( lambda t: len(str(t).split(" "))) dfppm[['sat1_oe', 'word_count']].head(20) # In[13]: dfppm.shape # In[14]: dfppm.columns # In[15]: def avg_word_len(text): tokens = str(text).split(" ") word_l = [len(word) for word in tokens] total_l = sum(word_l) avg_l = total_l / len(word_l) return (avg_l) #average word length dfppm['avg_word_length'] = dfppm['sat1_oe'].apply( lambda t: avg_word_len(t)) dfppm[['sat1_oe', 'avg_word_length']].head(10) # In[16]: #replace quotation with nothing dfppm['cleaned'] = dfppm['sat1_oe'].apply( lambda t: str(t).replace("'", "")) #convert into lower case dfppm['cleaned'] = dfppm['cleaned'].str.lower() #replace punctuations dfppm['cleaned'] = dfppm['cleaned'].str.replace('[^\w\s]', '') dfppm[['sat1_oe', 'cleaned']].head(10) # In[13]: #Stopwords removal def prepareStopWords(): stopwordsList = [] # Load default stop words and add a few more specific to my text. 
stopwordsList = stopwords.words('english') stopwordsList.append('dont') stopwordsList.append('didnt') stopwordsList.append('doesnt') stopwordsList.append('cant') stopwordsList.append('couldnt') stopwordsList.append('couldve') stopwordsList.append('im') stopwordsList.append('ive') stopwordsList.append('isnt') stopwordsList.append('theres') stopwordsList.append('wasnt') stopwordsList.append('wouldnt') stopwordsList.append('a') stopwordsList.append('also') stopwordsList.append('could') stopwordsList.append('would') stopwordsList.append('might') stopwordsList.append('may') return stopwordsList stop = prepareStopWords() stop = [i.replace("'", "") for i in stop] dfppm['bagofwords'] = dfppm['cleaned'].apply( lambda x: " ".join(x for x in x.split() if x not in stop)) dfppm[['sat1_oe', 'cleaned', 'bagofwords']].head(10) # In[14]: #common words removal #selecting only 5 because if it is more than 10 then the token reimbursement will be lost freq = pd.Series(' '.join(dfppm['bagofwords']).split()).value_counts() print(freq) # In[108]: #common words removal #selecting only 5 because if it is more than 10 then the token reimbursement will be lost freq10 = pd.Series(' '.join( dfppm['bagofwords']).split()).value_counts()[:10] print(freq10) # In[287]: #common words removal #selecting only 5 because if it is more than 10 then the token reimbursement will be lost freq5 = pd.Series(' '.join(dfppm['bagofwords']).split()).value_counts()[:6] print(freq5) # In[242]: type(freq5) # In[243]: freq5.index # In[15]: freq_s = freq.loc[[ 'uhc', 'insurance', 'unitedhealthcare', 'unitedhealthcares', 'patients', 'patient', 'insurances', 'united', 'uniteds', 'healthcare' ], ] freq_s # In[16]: #all_words=pd.Series(' '.join(dfppm['bagofwords']).split()).value_counts() #freq5 = list(freq5.index) dfppm['bagofwords'] = dfppm['bagofwords'].apply( lambda x: " ".join(x for x in x.split() if x not in freq_s)) dfppm[['sat1_oe', 'cleaned', 'bagofwords']].head(10) # In[17]: #Rare words removal #Optional in_freq = freq = pd.Series(' '.join( dfppm['bagofwords']).split()).value_counts()[-5:] print(in_freq) # In[18]: #Optional in_freq = list(in_freq.index) dfppm['bagofwords_rarewords'] = dfppm['bagofwords'].apply( lambda x: " ".join(x for x in x.split() if x not in in_freq)) dfppm.columns # In[116]: #Spelling corrections #spell_checked_comments = dfppm['bagofwords'].apply(lambda t: str(TextBlob(t).correct())) #dfppm['spell_checked'] = spell_checked_comments #dfppm[['sat1_oe','cleaned','spell_checked','bagofwords']].head(20) # In[19]: print(dfppm.columns) #Stemming st = PorterStemmer() dfppm['stemmed_text'] = dfppm['bagofwords'].apply( lambda x: " ".join([st.stem(word) for word in x.split()])) dfppm['stemmed_text'].head(10) # In[20]: #Lemmatization #fb_corpus = " ".join(t for t in total_df['feedback']) #wnl = nltk.WordNetLemmatizer() #content_tokens = fb_corpus.split(" ") #cleaned_tokens = [x for x in content_tokens if x is not ""] #CORPUS_tokens = [wnl.lemmatize(t) for t in cleaned_tokens] #CORPUS = " ".join(CORPUS_tokens) from textblob import Word dfppm['bagofwords'] = dfppm['bagofwords'].apply( lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) dfppm['bagofwords'].head(5) # In[21]: #TF-IDF matrix tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words='english', ngram_range=(1, 1)) sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=False) tfidf_set = sklearn_tfidf.fit_transform(dfppm['bagofwords']) tfidf_set # In[38]: #Polarity sia = 
SentimentIntensityAnalyzer() dfppm['polarity'] = dfppm['cleaned'].apply( lambda s: sia.polarity_scores(s)) dfppm['polarity_compound'] = dfppm['cleaned'].apply( lambda s: sia.polarity_scores(s)['compound']) dfppm['polarity_positive'] = dfppm['cleaned'].apply( lambda s: sia.polarity_scores(s)['pos']) dfppm['polarity_negative'] = dfppm['cleaned'].apply( lambda s: sia.polarity_scores(s)['neg']) dfppm['polarity_neutral'] = dfppm['cleaned'].apply( lambda s: sia.polarity_scores(s)['neu']) dfppm['subjectivity'] = dfppm['cleaned'].apply( lambda s: TextBlob(s).sentiment.subjectivity) #total_df.to_csv(os.path.join(path,'new_scores_ppm.csv'),index=False) dfppm[[ 'sat1_oe', 'cleaned', 'bagofwords', 'polarity', 'polarity_compound', 'polarity_positive', 'polarity_negative', 'polarity_neutral' ]].head(10) #Tested with both 'cleaned' and 'bagofwords' headers to see how sentiment values are varying # In[24]: dfppm.shape # In[172]: dfppm.columns # In[25]: parent_df = dfppm # In[26]: parent_df.shape # In[175]: parent_df.dtypes # In[27]: pd.Categorical(parent_df['sat1']).categories # In[28]: parent_df.isnull().sum() # In[31]: parent_df = parent_df[parent_df['sat1'] != 'nan'] parent_df.shape # In[29]: parent_df.select_dtypes(include=['O']).columns #sat2 is a string here - actually should be numeric # In[30]: parent_df['sat1'] = parent_df['sat1'].astype(int) parent_df.select_dtypes(include=['int32', 'int64']).columns # In[31]: pos_ppm_nps = parent_df[(parent_df['sat1'] == 9) | (parent_df['sat1'] == 10)] neu_ppm_nps = parent_df[(parent_df['sat1'] == 8)] neg_ppm_nps = parent_df[parent_df['sat1'] < 8] # In[32]: pos_ppm_nps.shape # In[36]: neg_ppm_nps.shape # In[134]: neu_ppm_nps.shape # In[42]: for ind, row in parent_df.iterrows(): if (parent_df.loc[ind, 'sat1'] == 8): parent_df.loc[ind, 'sentiment'] = 'neutral' elif (parent_df.loc[ind, 'sat1'] < 8): parent_df.loc[ind, 'sentiment'] = 'negative' elif (parent_df.loc[ind, 'sat1'] > 8): parent_df.loc[ind, 'sentiment'] = 'positive' value_cnts_actual = parent_df['sentiment'].value_counts() print(value_cnts_actual) # In[43]: for ind, row in parent_df.iterrows(): if (parent_df.loc[ind, 'polarity_positive'] > parent_df.loc[ind, 'polarity_negative']): parent_df.loc[ind, 'sentiment_predicted'] = 'positive' elif (parent_df.loc[ind, 'polarity_positive'] < parent_df.loc[ind, 'polarity_negative']): parent_df.loc[ind, 'sentiment_predicted'] = 'negative' elif (parent_df.loc[ind, 'polarity_positive'] == parent_df.loc[ ind, 'polarity_negative']): parent_df.loc[ind, 'sentiment_predicted'] = 'neutral' value_cnts_pred = parent_df['sentiment_predicted'].value_counts() print(value_cnts_pred) # In[44]: actuals = parent_df['sentiment'] predicted = parent_df['sentiment_predicted'] actuals_list = parent_df['sentiment'].tolist() predicted_list = parent_df['sentiment_predicted'].tolist() vader_conf_matrix = pd.crosstab(actuals, predicted) confusionMatrix = ConfusionMatrix(actuals_list, predicted_list) confusionMatrix.print_stats() cmatrix = confusion_matrix(actuals_list, predicted_list) print(classification_report(actuals_list, predicted_list)) accuracy_score(actuals_list, predicted_list) # In[45]: parent_df['textblob_polarity'] = parent_df['cleaned'].apply( lambda s: TextBlob(s).sentiment.polarity) # In[41]: parent_df.columns # In[46]: for ind, row in parent_df.iterrows(): if (parent_df.loc[ind, 'textblob_polarity'] > 0): parent_df.loc[ind, 'textblob_sentiment'] = 'positive' elif (parent_df.loc[ind, 'textblob_polarity'] < 0): parent_df.loc[ind, 'textblob_sentiment'] = 'negative' elif 
(parent_df.loc[ind, 'textblob_polarity'] == 0): parent_df.loc[ind, 'textblob_sentiment'] = 'neutral' textblob_sentiments = parent_df['textblob_sentiment'].tolist() tb_confMatrix = ConfusionMatrix(actuals_list, textblob_sentiments) #tb_cm = pd.crosstab(actuals_list,textblob_sentiments) tb_confMatrix.print_stats() finalCols = list(original_cols) + [ 'polarity_compound', 'polarity_positive', 'polarity_negative', 'polarity_neutral', 'sentiment', 'sentiment_predicted' ] visualdf = parent_df[finalCols] return (visualdf)
targetKeywords = di['targetKeywords'] #targetParagraphs = [di['targetParagraphs']] targetParagraphs = di['targetParagraphs'] targetTitle = di['targetTitle'] # In[403]: tstart = time.time() # In[404]: #Vector dimension extraction postText_cleaned_splitted = [ Word(i).lemmatize().strip().lower() for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(postText))) ] targetTitle_cleaned_splitted = [ Word(i).lemmatize().strip().lower() for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetTitle))) ] targetDescription_cleaned_splitted = [ Word(i).lemmatize().strip().lower() for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetDescription))) ] #targetParagraphs_cleaned_splitted = [Word(j).lemmatize().strip().lower() for i in eval(str(targetParagraphs)) for j in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetParagraphs)))] targetParagraphs_cleaned_splitted = [ Word(i).lemmatize().strip().lower() for i in word_tokenize(re.sub(r'([^\s\w]|_)+', '', str(targetParagraphs)))
def lemmatize_ingredient(single_ingredient): words = single_ingredient.split() converted_words = [Word(word) for word in words] lemmatized_words = [converted_word.lemmatize() for converted_word in converted_words] lemmatized_text = " ".join(lemmatized_words) return(lemmatized_text)
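# Usage sketch for lemmatize_ingredient (illustrative input):
print(lemmatize_ingredient("chopped tomatoes and red onions"))
# roughly: "chopped tomato and red onion"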
sw = stopwords.words('english')
df['Phrase'] = df['Phrase'].apply(
    lambda x: " ".join(x for x in x.split() if x not in sw))
df.head()

# Removing rare words
sil = pd.Series(' '.join(df['Phrase']).split()).value_counts()[-1000:]
df['Phrase'] = df['Phrase'].apply(
    lambda x: " ".join(x for x in x.split() if x not in sil))

# Lemmatization
from textblob import Word
nltk.download('wordnet')
df['Phrase'] = df['Phrase'].apply(
    lambda x: " ".join([Word(i).lemmatize() for i in x.split()]))
df.head()
df.info()

from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
df["sonuç"] = encoder.fit_transform(df["sonuç"])
df.head()

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(df["Phrase"], df["sonuç"],
for line in low:
    # print(line)
    # filter out punctuation and digits
    line = line.translate(str.maketrans('', '', '",.()!;/?0123456789'))
    #tem = TextBlob(line)
    #s = tem.correct()
    tokens = []
    for word in line.split():
        tokens.append(str(word).strip())
    wordDict[i] = tokens
    i = i + 1

# Spelling correction, using the TextBlob library (this takes a while)
correct_Dict = {}
for index, words in wordDict.items():
    corrected = []
    for word in words:
        w = Word(word)
        corrected.append(w.correct())
    correct_Dict[index] = corrected

# Singularization, also via TextBlob (this takes a while too)
singular_Dict = {}
for index, words in correct_Dict.items():
    singulars = []
    for word in words:
        singulars.append(str(word.singularize()))
    singular_Dict[index] = singulars

# Stemming: after stemming, some words were no longer recognizable,
# so this part is commented out. It used nltk.
'''
stemed_Dict = {}  # dictionary of stemmed words
from nltk.stem import PorterStemmer
pst = PorterStemmer()
def lemmatize(self, list_of_tokens):
    # Given a tokenized text, lemmatize each token using its POS tag.
    # THIS TAKES A LONG TIME EVEN FOR A SHORT STRING, because it builds
    # a TextBlob (and runs the tagger) for every single token.
    pos_tags = [TextBlob(e).tags[0] for e in list_of_tokens]
    return [Word(word).lemmatize(wordnet_pos_code(tag)) for (word, tag) in pos_tags]
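# A sketch of a faster variant (an assumption, not the original code): tag the
# whole text once instead of building a TextBlob per token, then map the Penn
# Treebank tags to WordNet POS codes for the lemmatizer.
from textblob import TextBlob, Word

def fast_lemmatize(text):
    def to_wordnet_pos(tag):
        if tag.startswith('V'):
            return 'v'  # verb
        if tag.startswith('J'):
            return 'a'  # adjective
        if tag.startswith('R'):
            return 'r'  # adverb
        return 'n'      # default: noun
    return [Word(word).lemmatize(to_wordnet_pos(tag))
            for word, tag in TextBlob(text).tags]

print(fast_lemmatize("the striped bats were hanging on their feet"))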
from textblob import TextBlob, Word sentence = TextBlob("he wass verry siccck ") print(sentence.correct()) word = Word("gooa") print(word.spellcheck())
## 2. Removing digits
df['user_story'] = df['user_story'].apply(lambda x: ''.join([c for c in x if not c.isdigit()]))

## 2.4 Common words (removal)
pd.Series(' '.join(df['user_story']).split()).value_counts()[:10]

# uncommon words
pd.Series(' '.join(df['user_story']).split()).value_counts()[-10:]

## 2.5 Tokenization

## 2.6 Stemming
user_story_stemmed = df['user_story'].apply(lambda x: " ".join([ps.stem(word) for word in x.split()]))

## 2.7 Lemmatization
user_story_lemmatized = df['user_story'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['user_story'] = df['user_story'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# remove stopwords again since the corpus is now lemmatized;
# first lemmatize the added stopwords so they match
stop_words.add(Word("user").lemmatize())
stop_words.add(Word("want").lemmatize())
stop_words.add(Word("able").lemmatize())
stop_words.add(Word("would").lemmatize())
stop_words.add(Word("like").lemmatize())
stop_words.add(Word("need").lemmatize())

# filter out stopwords
df['user_story'] = df['user_story'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

#### Advanced text processing
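# A minimal sketch of one common "advanced" next step (assuming scikit-learn
# is available): turn the cleaned user stories into TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
story_features = tfidf.fit_transform(df['user_story'])
print(story_features.shape)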