import re

from nltk.stem import PorterStemmer


def preprocessing(tagged_by_Sentence):
    """
    1. Remove special characters, lowercase
    2. not, "n't" -> not_ + stem(next word)
    3. Remove special characters and numbers
    4. Remove stopwords
    5. Stemming
    """
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    stopwords.remove("not")
    stopwords.remove('very')
    stopwords.append("'m")
    stopwords.append("'s")
    re_special = re.compile('[^A-Za-z0-9]+')  # everything except letters and digits
    re_num = re.compile('[0-9]+')  # digits
    st = PorterStemmer()
    new_sent = []
    not_indice = []
    for sent in tagged_by_Sentence:
        # 1. remove special characters, lowercase
        text = [(tup[0].lower(), tup[1]) for tup in sent
                if not bool(re_special.match(tup[0]))]
        # 2. merge "not"/"n't" with the next word:
        #    when "not"/"n't" appears, join it with the following word and store that
        #    word's index so it can be removed later by del_element_by_indice
        new_text = []
        for index, tup in enumerate(text):
            if tup[0] == "n't" or tup[0] == "not":
                if index + 1 < len(text):
                    if not bool(re_special.match(text[index + 1][0])) or text[index + 1][1] != 'CD':
                        new_text.append("not_" + st.stem(text[index + 1][0]))
                        not_indice.append(index)
                else:
                    new_text.append("not")
            else:
                # 3. remove special characters and numbers
                if not bool(re_num.match(tup[0])) or tup[1] != 'CD':
                    new_text.append(tup[0])
        new_text = del_element_by_indice(new_text, not_indice)
        # 4, 5. remove stopwords, then stem
        new_words = [st.stem(word) for word in new_text if word not in stopwords]
        new_sent.append(new_words)
    return new_sent
def __filtering_sastrawi(self, documents):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    list_stop = stop_factory + self.stop_more
    dictionary = ArrayDictionary(list_stop)
    stopwords = StopWordRemover(dictionary)
    stop = stopwords.remove(documents)
    return stop
from nltk.tokenize import word_tokenize


def removeStopWords(sentences, stopwords=None):
    '''
    :param sentences: list of sentences
    :param stopwords: list of stopwords
    :return: list of sentences without stopwords
    '''
    if stopwords is None:
        from nltk.corpus import stopwords
        stopwords = stopwords.words('english')
        stopwords.remove('most')
    sentences1 = []
    for sent in sentences:
        newsent = ''
        for word in word_tokenize(sent):
            if word not in stopwords:
                newsent = newsent + ' ' + word
        sentences1.append(newsent)
    return sentences1
def stopwords_e_pontuacao(self, instancia):
    # tokenize with a simple whitespace split
    instancia = instancia.split()
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    instancia = [w.translate(table) for w in instancia]
    # keep only alphabetic tokens
    instancia = [word for word in instancia if word.isalpha()]
    # filter out stopwords
    stopwords = nltk.corpus.stopwords.words('portuguese') + [
        'aqui', 'a', 'rs', 'é', '/', 'fdp', '%', 'pfvr', 'cadê', 'né', 'q',
        'pq', '#', '@', 'mt', 'youtube', 'hj', 'dnv', 'mto', 'vc', 'eh',
        'r$', 'rt', 'via', 'vía'
    ]
    stopwords.remove("não")
    instancia = [w for w in instancia if w not in stopwords]
    # detokenize (necessary to pass the result as an argument to build a TextBlob object)
    with MosesDetokenizer('pt') as detokenize:
        instancia = detokenize(instancia)
    return instancia
def update_stopwords(self, add_words=[], remove_words=[], update_corpus=True):
    stopwords = self.stopwords
    for word in add_words:
        stopwords.append(word)
    for word in remove_words:
        if word in stopwords:
            stopwords.remove(word)
    self._stopwords_ = stopwords
    if update_corpus:
        self.prepare_corpus()
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

nltk.download('punkt')
nltk.download('stopwords')

punctuation = list(string.punctuation)
stopwords = stopwords.words('english')
http_link = 'https://'

# remove negation words from stopwords
neg_words = ['no', 'nor', 'not', 'wasn', 'weren']
for word in neg_words:
    stopwords.remove(word)


def process(tweet):
    tweet = reduce_lengthening(tweet)
    token_list = tokenize(tweet)
    processed_token_list = process_token(token_list)
    stem_token_list = stemming(processed_token_list)
    return stem_token_list


def tokenize(tweet):
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    token_list = tokenizer.tokenize(tweet)
    return token_list
import re

from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

stopwords = stopwords.words('english')
newStopWords = ['', ' ', ' ', ' ', ' ', ' s']
stopwords.extend(newStopWords)
stopwords.remove('no')
stopwords.remove('not')
stopwords.remove('very')
stop_words = set(stopwords)


def clean_doc(doc, vocab=None):
    tokens = word_tokenize(doc)
    tokens = [re.sub('[^a-zA-Z]', ' ', word) for word in tokens]
    tokens = [word.lower() for word in tokens]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    if vocab:
        tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens
import csv

import nltk

posneg_feature_vectors_test = []
posneg_feature_vectors_train = []
posneg_feature_vectors_test = []
full_data = []
label_vector = []
pos = dict()
neg = dict()
posneg = dict()
part_of_speech = []
# global_index = 0
set_size = 8000
top_k_features = 200
end_index = 0
count = 0
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('not')
train_size = 0.9 * set_size


def _read_data(file_name):
    """
    :rtype : object
    """
    row_cnt = -1
    with open(file_name, 'rb') as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t')
        index = 0
        for row in tsvin:
from os import path

import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Saving the np arrays into text files
np.savetxt('train_p.txt', p, delimiter=' ', fmt='%s', encoding="utf-8")
np.savetxt('train_n.txt', n, delimiter=' ', fmt='%s', encoding="utf-8")

# reading the text files and removing the stop words:
d = path.dirname('.')
textp_w = open(path.join(d, 'train_p.txt'), encoding='utf-8').read()
textn_w = open(path.join(d, 'train_n.txt'), encoding='utf-8').read()

stopwords = set(STOPWORDS)
stopwords.add("said")
stopwords.add("br")
stopwords.add(" ")
stopwords.remove("not")
stopwords.remove("no")
# stopwords.remove("good")
# stopwords.remove("love")
stopwords.remove("like")
# stopwords.remove("best")
# stopwords.remove("!")

print("Total number of words in duplicate pair questions :", len(textp_w))
print("Total number of words in non duplicate pair questions :", len(textn_w))

wc = WordCloud(background_color="white", max_words=len(textp_w),
               stopwords=stopwords)
wc.generate(textp_w)
print("Word Cloud for Duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
import os
import string

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from nltk.corpus import stopwords

from model.utils import embedding_metric, Tokenizer, detokenize
from torchMoji.api.botmoji import Botmoji
from inferSent.api.botsent import Botsent
from Toxicity.toxic import NBLogisticRegression, NBTfidfVectorizer, tokenize

EPSILON = np.finfo(np.float32).eps
ROOT_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

tokenizer = Tokenizer('spacy')
stopwords = stopwords.words('english')
question_words = {'who', 'what', 'why', 'where', 'how', 'when'}
_ = [stopwords.remove(q) for q in question_words]
punct = list(string.punctuation)
contractions = ["'s", "'d", "'ld", "n't", "'re", "'ll", "'ve"]
filters = set(stopwords + contractions + punct)


def _get_emojis():
    # All emojis in the order returned by deepmoji
    EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: :pensive: " + \
        ":ok_hand: :blush: :heart: :smirk: :grin: :notes: :flushed: " + \
        ":100: :sleeping: :relieved: :relaxed: :raised_hands: " + \
        ":two_hearts: :expressionless: :sweat_smile: :pray: " + \
        ":confused: :kissing_heart: :heartbeat: :neutral_face: " + \
        ":information_desk_person: :disappointed: :see_no_evil: " + \
        ":tired_face: :v: :sunglasses: :rage: :thumbsup: :cry: " + \
        ":sleepy: :yum: :triumph: :hand: :mask: :clap: :eyes: :gun: " + \
val["ld"] = str(link_data) val["wn"] = str(workflow_name) val["un"] = str(user_data) val["stu"] = start_time_user resp = jsonify(val) resp.headers['Access-Control-Allow-Origin'] = '*' return resp from nltk.corpus import stopwords stopwords = list(stopwords.words('english')) stopwords.extend(list(string.punctuation)) stopwords.append("i\'ve") stopwords.append("i\'m") stopwords.remove("no") stopwords.remove("not") stopwords.remove("than") stopwords.remove("which") stopwords.remove("or") def remove_stopwords(text): """custom function to remove the stopwords""" return " ".join( [word for word in str(text).split() if word not in stopwords]) @app.route('/uploader', methods=['GET', 'POST']) def upload_file(): if request.method == 'POST':
import csv

from nltk.corpus import stopwords

list_7up_cocacola = ['7 Up', '7 UP', '7 up', '7 uP', '7-Up', '7-UP', '7.up',
                     '7-up', '7-uP', '7up', '7UP', '7Up', '7uP',
                     'coca cola', 'COCA COLA', 'coca-cola', 'COCACOLA']
cola_list = ['cola', 'coca', 'coke']
softdrink_list = ['pepsi', 'mirinda', 'cocacola', 'thumbs up', 'coca-cola',
                  '7up', 'mirinda', 'sprite', 'fanta', 'limca', 'twister']
prep = ['with', 'in', 'over', 'by', 'above', 'at', 'from', 'on', 'about']
list_bbq = ['b b q', 'b l t']
quantity = ['lb', 'oz', 'ozs', 'lbs', 'plate', 'seasonal', 'little', 'per',
            'v.', 'big', 'small', 'large', 'medium', 'can', 'cans', 'per',
            'l', 'L', 'glass', 'ly', 'ml', 'litre', 'gram', 'grams', 'gm',
            'gms', 'kg', 'kgs', 'cl', 'pcs', 'pieces', 'piece', 'bottle',
            'bottles', 'large', 'medium', 'med', 'small', 'inch', 'inches', 'g']
phrase = ['small bottle of', 'small bottles of', 'large bottle of',
          'large bottles of', 'per glass', 'per bottle', 'big bowl of',
          'bowl of', 'bottle of', 'bottles of']
stopwords = list(set(stopwords.words('english')))
stopwords.remove('and')
stopwords.extend(['light', 'extra', 'addon', 'add-on', 'extras', 'spare',
                  'day', 'spare', 'double', 'alacarte', 'regular', 'fresh',
                  'homemade', 'bowl', 'plate', 'little', 'hs', 'HS', 'add',
                  'cup', 'however', 'often', 'widest', 'special', 'children',
                  'review', 'reviews', 'authentic'])
# print(stopwords)
remove_phrase = ['freshly brewed', 'ala carte', 'add on']
menu_phrase_remove = []


def punct(menu, punctuations):
    for char in menu:
        if char in punctuations:
            menu = menu.replace(char, ' ')
    menu = ' '.join(menu.split())
    return menu


outputfile = open("output.csv", "wb")
writer = csv.writer(outputfile)
import re

import emoji
import nltk
#--------------------------------------------------------------------------------#
from flask import Flask, request, render_template
from flask_restful import Api, Resource
from textblob import TextBlob
#--------------------------------------------------------------------------------#
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#--------------------------------------------------------------------------------#
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stopwords.remove("not")
stopwords.remove("no")
stopwords.remove("nor")
stopwords.remove("above")
#--------------------------------------------------------------------------------#
lemma = WordNetLemmatizer()
# Do this first: evaluating the generator once "materializes" the LazyCorpusLoader
next(swn.all_senti_synsets())
#--------------------------------------------------------------------------------#
pattern = '@\S+|https?:\S+|http?:\S|[^A-Za-z]+|com|net'
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = '@[^\s]+'
alphaPattern = "[^a-zA-Z0-9]"
from string import punctuation

from nltk.corpus import stopwords

from lesk import simple_lesk, original_lesk
from similarity import max_similarity
from utils import lemmatize, lemmatize_sentence
"""
This is a module for all-words full-text WSD
(modified for use in the tropical_models framework).

This would involve:
Step 1: First tokenize your text such that each token is separated by whitespace
Step 2: Iterate through the tokens and only disambiguate the content words.
"""

stopwords = stopwords.words('english') + list(punctuation)
stopwords.remove('is')
stopwords.remove('are')
stopwords.remove('was')
stopwords.remove('had')
stopwords.remove('being')
stopwords.remove('were')
stopwords.remove('been')
stopwords.remove('has')
stopwords.remove('be')


def disambiguate(sentence,
                 algorithm=simple_lesk,
                 context_is_lemmatized=False,
                 similarity_option='path',
                 keepLemmas=False,
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

df = pd.read_csv("train.csv")
nltk.download('stopwords')

i = nltk.corpus.stopwords.words('english')
stopwords = set(i)
stopwords.remove("not")
stopwords.remove("against")
stopwords.remove("no")


def preprocess(x):
    if type(x) == str:
        x = re.sub('[^a-z\s]', '', x.lower())
        x = re.sub(r'[^\w\s]', "", x)
        x = [w for w in x.split() if w not in stopwords]
        return x


g = []
for i in range(0, len(df)):
    y = preprocess(df['text'][i])
import re

from nltk.corpus import stopwords

from src.helpers.debug import top_keys

stopwords = set(stopwords.words('english'))
stopwords.remove('don')
stopwords.remove('will')


# filter out token
def valid_tkn(tkn, valid_kw, invalid_kw):
    tkn = tkn.lower()
    if tkn in valid_kw:
        return True
    if tkn in invalid_kw:
        return False
    # stopwords
    if tkn in stopwords:
        return False
    # ampersand and twitter link
    twitter_stop = ['&', 'rt', 'http']
    if '//t.co/' in tkn or tkn in twitter_stop:
        return False
    # special unicode character
    if any(ord(c) > 128 for c in tkn):
        return False


regex = re.compile('[^a-zA-Z]')
# import tensorflow_datasets as tfds
import nltk
import pandas as pd
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords = stopwords.words('english')
stopwords.remove('not')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

# df_gen_1 = pd.read_csv('forum_content.csv', names=["link", "content", "label"])
# df_gen_2 = pd.read_csv('forum_content_gen.csv', names=["link", "content", "label"])
# df_am_1 = pd.read_csv('forum_content_literature_am.csv', names=["link", "content", "label"])
# df_am_2 = pd.read_csv('forum_content_am.csv', names=["link", "content", "label"])
#
# df_gen_1['forum_type'] = "GENERAL_FORUM"
# df_gen_2['forum_type'] = "GENERAL_FORUM"
# df_am_1['forum_type'] = "AFRICAN_AMERICAN_GENERAL_FORUM"
# df_am_2['forum_type'] = "AFRICAN_AMERICAN_GENERAL_FORUM"

df_af_am_forum = pd.read_csv('content_am_new_debug.csv',
                             names=["member_no", "content", "threadUrl",
                                    "title", "postDate", "is_am", "author"])
# df_wm_forum = pd.read_csv('content_gen.csv', names=["member_no","content","threadUrl","title","postDate","is_am","author"])
def preprocess_text(df, columnname):
    import warnings
    import re

    import nltk
    import pandas as pd
    from nltk import FreqDist

    nltk.download('punkt')
    nltk.download('stopwords')
    warnings.filterwarnings("ignore")

    df = df.drop_duplicates(subset=columnname)
    print(df.shape)
    df[columnname] = df[columnname].map(lambda x: re.sub(r'http\S+', '', str(x)))
    df[columnname] = df[columnname].map(
        lambda x: re.sub(r'[^ a-zA-Z0-9!?:,.\'=]', '', str(x)))
    df[columnname] = df[columnname].str.lower()

    # Expand contractions
    contractions_dict = {
        "didn't": "did not", "don't": "do not", "aren't": "are not",
        "can't": "cannot", "cant": "cannot", "can't've": "cannot have",
        "'cause": "because", "could've": "could have", "couldn't": "could not",
        "couldn't've": "could not have", "didnt": "did not",
        "doesn't": "does not", "doesnt": "does not", "dont": "do not",
        "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
        "haven't": "have not", "he'd": "he had", "he'd've": "he would have",
        "he'll": "he will", "he's": "he is", "how'd": "how did",
        "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
        "i'd": "i had", "i'd've": "i would have", "i'll": "i will",
        "i'm": "i am", "im": "i am", "i've": "i have", "isn't": "is not",
        "it'll": "it will", "it's": "it is", "let's": "let us",
        "ma'am": "madam", "mayn't": "may not", "might've": "might have",
        "mightn't": "might not", "must've": "must have", "mustn't": "must not",
        "mustn't've": "must not have", "needn't": "need not",
        "needn't've": "need not have", "oughtn't": "ought not",
        "oughtn't've": "ought not have", "shan't": "shall not",
        "sha'n't": "shall not", "shan't've": "shall not have",
        "she'd": "she had", "she'd've": "she would have", "she'll": "she will",
        "she's": "she is", "should've": "should have",
        "shouldn't": "should not", "shouldn't've": "should not have",
        "that's": "that is", "there's": "there is", "they'd": "they had",
        "they'd've": "they would have", "they'll": "they will",
        "they're": "they are", "they've": "they have", "to've": "to have",
        "wasn't": "was not", "we'd": "we had", "we'd've": "we would have",
        "we'll": "we will", "we'll've": "we will have", "we're": "we are",
        "we've": "we have", "weren't": "were not", "what're": "what are",
        "what's": "what is", "what've": "what have", "when've": "when have",
        "where'd": "where did", "where's": "where is",
        "where've": "where have", "who'll": "who will", "who's": "who is",
        "will've": "will have", "won't": "will not",
        "won't've": "will not have", "would've": "would have",
        "wouldn't": "would not", "wouldn't've": "would not have",
        "y'all": "you all", "you'll": "you will", "you're": "you are",
        "you've": "you have"
    }
    contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

    def expand_contractions(s, contractions_dict=contractions_dict):
        def replace(match):
            return contractions_dict[match.group(0)]
        return contractions_re.sub(replace, s)

    df[columnname] = df[columnname].apply(expand_contractions)
    df = df.reset_index(drop=False)
    data = df[[columnname, 'index']]
    print(data.columns)
    data.rename(columns={'index': 'INDEX'}, inplace=True)

    # Split each review into sentences, one sentence per row
    from nltk.tokenize import sent_tokenize
    data['split'] = data[columnname].apply(sent_tokenize)
    data_split = data.set_index('INDEX').split.apply(
        pd.Series).stack().reset_index(level=0).rename(columns={0: columnname})
    data_split.reset_index(level=0, inplace=True)
    data_split.rename(columns={'INDEX': 'review_no', 'index': 'sentence'},
                      inplace=True)

    # Spell-correction algorithm
    from symspellpy.symspellpy import SymSpell  # import the module
    max_edit_distance_dictionary = 0
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "./frequency_dictionary_en_82_765.txt"
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    # if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    data_split[columnname] = data_split[columnname].apply(sym_spell.word_segmentation)
    data_split[columnname] = data_split[columnname].apply(lambda x: x.corrected_string)

    from Sentiment_prediction_noor import Process_and_predict
    data_split = Process_and_predict(data_split, columnname)
    df.rename(columns={'index': 'review_no'}, inplace=True)
    # print(df.head())
    df.rename(columns={columnname: "Text"}, inplace=True)
    data_final = data_split.merge(df, on="review_no", how='left')
    print(data_final.columns)
    # print(data_final.head(2))

    # Natural Language Processing of Reviews (Top Features, Feelings, Actions)
    import spacy
    from spacy import displacy
    from collections import Counter
    import en_core_web_sm
    nlp = en_core_web_sm.load()

    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    newStopWords = ['-PRON-']
    stopwords.extend(newStopWords)
    words2 = ['no', 'nor', 'not']
    # iterate on a copy, since removing while iterating would skip entries
    for word in list(stopwords):
        if word in words2:
            stopwords.remove(word)

    # drop empty tokens
    final = data_final[columnname].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 0]))
    final = pd.DataFrame(final)
    # tokenized_reviews = pd.Series(reviews).apply(lambda x: x.split())
    tokenized_reviews = pd.Series(final[columnname]).apply(lambda x: x.split())

    # def lemmatization(texts, tags=['NOUN', 'ADJ']):
    # def lemmatization(texts, tags=['NOUN', 'ADJ', 'VERB']):
    def lemmatization(texts, tags=['NOUN']):
        output = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            output.append([token.lemma_ for token in doc if token.pos_ in tags])
        return output

    def lemmatization_adj(texts, tags=['ADJ']):
        output = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            output.append([token.lemma_ for token in doc if token.pos_ in tags])
        return output

    def lemmatization_verb(texts, tags=['VERB']):
        output = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            output.append([token.lemma_ for token in doc if token.pos_ in tags])
        return output

    # Pair each noun with the first adjective that follows it
    noun_adj_pairs = []
    noun_adj_sent = []
    for l in range(len(final)):
        doc = nlp(str(final[columnname][l]))
        noun_adj_pairs = []
        for i, token in enumerate(doc):
            if token.pos_ not in ('NOUN', 'PROPN'):
                continue
            for j in range(i + 1, len(doc)):
                if doc[j].pos_ == 'ADJ':
                    noun_adj_pairs.append((token, doc[j]))
                    break
        noun_adj_sent.append(noun_adj_pairs)
    final['noun_adj'] = noun_adj_sent

    # Noun Extraction
    print("Beginning Noun Extraction")
    reviews_noun = lemmatization(tokenized_reviews)
    reviews_3 = []
    for i in range(len(reviews_noun)):
        reviews_3.append(' '.join(reviews_noun[i]))
    final['features'] = reviews_3
    print("Noun Extraction Complete")

    # Adjective Extraction
    print("Beginning Adjectives Extraction")
    reviews_adj = lemmatization_adj(tokenized_reviews)
    reviews_4 = []
    for i in range(len(reviews_adj)):
        reviews_4.append(' '.join(reviews_adj[i]))
    final['feelings'] = reviews_4
    print("Adjectives Extraction Complete")

    # Verb Extraction
    print("Beginning Verb Extraction")
    reviews_verb = lemmatization_verb(tokenized_reviews)
    reviews_5 = []
    for i in range(len(reviews_verb)):
        reviews_5.append(' '.join(reviews_verb[i]))
    final['action'] = reviews_5
    print("Verb Extraction Complete")

    # remove short words (length <= 2)
    final['features'] = final['features'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
    final['feelings'] = final['feelings'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
    final['action'] = final['action'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
    # final['adverb'] = final['adverb'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))

    # Remove stopwords and leading/trailing spaces
    def remove_stopwords(text):
        text = ' '.join([word for word in text.split() if word not in stopwords])
        return text.strip()

    final['features'] = final['features'].apply(remove_stopwords)
    final['feelings'] = final['feelings'].apply(remove_stopwords)
    final['action'] = final['action'].apply(remove_stopwords)
    final = final.drop([columnname], axis=1)

    def freq_words(x):
        all_words = ' '.join([text for text in x])
        all_words = all_words.split()
        fdist = FreqDist(all_words)
        words_df = pd.DataFrame({'word': list(fdist.keys()),
                                 'count': list(fdist.values())})
        # keep words whose frequency reaches the 85th percentile
        length = int(words_df['count'].quantile(0.85))
        # sort results - descending
        d = words_df[words_df['count'] >= length].sort_values(['count'],
                                                              ascending=False)
        d = d['word'].tolist()
        return d

    feature_filt_dict = freq_words(final['features'])
    action_filt_dict = freq_words(final['action'])
    feeling_filt_dict = freq_words(final['feelings'])

    def remove_min_words(text, dictionary):
        text = ' '.join([word for word in text.split() if word in dictionary])
        return text

    final['feelings'] = final['feelings'].apply(remove_min_words,
                                                dictionary=feeling_filt_dict)
    final['action'] = final['action'].apply(remove_min_words,
                                            dictionary=action_filt_dict)
    final['features'] = final['features'].apply(remove_min_words,
                                                dictionary=feature_filt_dict)
    # final['adverb'] = final['adverb'].apply(remove_min_words, dictionary=feature_filt_dict)

    # Remove duplicate words within a cell
    from collections import OrderedDict
    final['features'] = final['features'].str.split().apply(
        lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    final['feelings'] = final['feelings'].str.split().apply(
        lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    final['action'] = final['action'].str.split().apply(
        lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    # final['adverb'] = final['adverb'].str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')
    print(final.columns)

    # One output row per noun-adjective pair
    rows = []
    _ = final.apply(lambda row: [
        rows.append([row['features'], row['feelings'], row['action'], na])
        for na in row.noun_adj
    ], axis=1)
    final = pd.DataFrame(rows, columns=['features', 'feelings', 'action', 'noun_adj'])
    # pd.concat([df_new.noun_adj.str.extract('(?P<col1>\d+),(?P<col2>\d+)'), df_new], axis=1)
    final = pd.concat([
        final.noun_adj.apply(
            lambda x: pd.Series(x, index=['Feature_n', 'Feeling_adj'])),
        final
    ], axis=1)
    final_data = pd.concat([data_final, final], axis=1)
    final_data.reset_index(inplace=True)
    print(final_data.head())
    print(final_data.dtypes)

    # Keep reviews that are longer than 3 characters
    # final_data = final_data[final_data['Text'].apply(lambda x: len(x) > 3)]
    mask = (final_data['Text'].str.len() > 3)
    final_data = final_data.loc[mask]
    print(final_data.columns)
    final_data = final_data.drop(columns='index')

    mycolumns = final_data.columns
    data_final_nlp = final_data[mycolumns]
    features_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment', 'features'
    ]]
    actions_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment', 'action'
    ]]
    feelings_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment', 'feelings'
    ]]
    noun_adj_file = data_final_nlp[[
        'sentence', 'review_no', columnname, 'Text', 'comp_sentiment',
        'Feature_n', 'Feeling_adj'
    ]]

    # Explode pandas dataframe string entry to separate rows
    import numpy as np

    def explode(df, lst_cols, fill_value='', preserve_index=False):
        # make sure `lst_cols` is list-alike
        if (lst_cols is not None and len(lst_cols) > 0
                and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
            lst_cols = [lst_cols]
        # all columns except `lst_cols`
        idx_cols = df.columns.difference(lst_cols)
        # calculate lengths of lists
        lens = df[lst_cols[0]].str.len()
        # preserve original index values
        idx = np.repeat(df.index.values, lens)
        # create "exploded" DF
        res = (pd.DataFrame(
            {col: np.repeat(df[col].values, lens) for col in idx_cols},
            index=idx).assign(
                **{col: np.concatenate(df.loc[lens > 0, col].values)
                   for col in lst_cols}))
        # append those rows that have empty lists
        if (lens == 0).any():
            # at least one list in cells is empty
            res = (res.append(df.loc[lens == 0, idx_cols]).fillna(fill_value))
        # revert the original index order
        res = res.sort_index()
        # reset index if requested
        if not preserve_index:
            res = res.reset_index(drop=True)
        return res

    # Creation of the Features, Feelings and Actions files
    features_file = features_file.assign(features=features_file.features.str.split(' '))
    actions_file = actions_file.assign(action=actions_file.action.str.split(' '))
    feelings_file = feelings_file.assign(feelings=feelings_file.feelings.str.split(' '))
    # adverb_file = adverb_file.assign(adverb=adverb_file.adverb.str.split(' '))
    features_file = explode(features_file, ['features'], fill_value='')
    actions_file = explode(actions_file, ['action'], fill_value='')
    feelings_file = explode(feelings_file, ['feelings'], fill_value='')
    # adverb_file = explode(adverb_file, ['adverb'], fill_value='')
    features_file.to_excel("Features_file.xlsx", index=False)
    actions_file.to_excel("Actions_file.xlsx", index=False)
    feelings_file.to_excel("Feelings_file.xlsx", index=False)
    noun_adj_file.to_excel("Noun_adj_file.xlsx", index=False)
    return features_file, actions_file, feelings_file, noun_adj_file
Without empirical features, the classification performance is very poor (around 60%),
since each deal is very short and there is little difference between good and bad deals.
I tried linear SVM, decision tree, logistic regression with Lasso penalty, and naive Bayes,
among which linear SVM performs the best.

- How did you test your classifier?
Answer: Randomly sample 1/6 of the training data as test data and use the remaining 5/6
for training; build the model and select parameters. Repeat the process 20 times.
Finally, compute the average accuracy, train the model on all the training data, and
apply it to test_deals.txt. However, the test data is unlabeled, so I only output the
prediction result. By manually checking, lines 42, 50, and 53 mention coupon codes,
and our prediction results are all 1!
"""

import nltk
import re
import numpy as np
from sklearn import svm
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
if 'off' in stopwords:
    stopwords.remove('off')


def normalise(word):
    """Normalises a word to lowercase, then stems and lemmatizes it."""
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word


def acceptable_word(word):
    """Checks conditions for an acceptable word: length, stopword."""
    word = word.lower()
    accepted = bool(2 <= len(word) <= 40 and word not in stopwords)
    return accepted


def document_features(document):
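The evaluation protocol described in the answer above (repeat a random 1/6 hold-out split 20 times, average the accuracy, then retrain on all data) can be summarized with a short sketch. This is a minimal illustration, not the original evaluation code: the names X (feature matrix), y (labels) and the scikit-learn LinearSVC choice are assumptions standing in for whatever features and linear SVM the author actually used.

# Hedged sketch of the repeated random-split evaluation described above.
# Assumed placeholders: X = feature matrix, y = labels (not from the original code).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


def repeated_holdout_accuracy(X, y, n_repeats=20, test_fraction=1 / 6):
    scores = []
    for seed in range(n_repeats):
        # randomly sample 1/6 of the data as the test set, train on the remaining 5/6
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_fraction, random_state=seed)
        clf = LinearSVC().fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
    # average accuracy over the repeated random splits
    return np.mean(scores)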
import csv
import re
import math

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag.perceptron import PerceptronTagger
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sknn.mlp import Classifier, Layer

from common import load_data
from bc_prog import bc_prog

tagger = PerceptronTagger()
lemmatizer = WordNetLemmatizer()
basictizer = bc_prog()
stopwords = stopwords.words('english')
stopwords.remove("but")
stopwords.remove("not")
stopwords.remove("no")
stopwords.remove("very")
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
abbrev_dict = ["'m", "n't", "'s", "'re", "'ve"]


def wordnet_pos_code(tag):
    if tag is None:
        return ''
    elif tag.startswith('NN'):
        return wordnet.NOUN
    elif tag.startswith('VB'):
        return wordnet.VERB
    elif tag.startswith('JJ'):
import matplotlib.pyplot as plt

# plt.plot([0, 1], [0, 1], 'r--')
# plt.xlim([-0.1, 1.2])
# plt.ylim([-0.1, 1.2])
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
plt.show()

######## Applying deep learning (LSTM)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf.fit(result['Reviews'])

###########################################################
from wordcloud import STOPWORDS
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

stopwords = set(STOPWORDS)
stopwords.remove("not")
count_vect = CountVectorizer(min_df=2, stop_words=stopwords, ngram_range=(1, 2))
tfidf_transformer = TfidfTransformer()
df_cv = count_vect.fit_transform(result["Reviews"])
df_tf = tfidf_transformer.fit_transform(df_cv)

#################################################################
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
import re

max_fatures = 30000