def metatransformation(client, db, query, to_train=True):
    """
    Arguments:
        query: MongoDB query
        to_train:
            True: return a train and test dataset
            False: return only data to predict out of sample

    Step 1: Extract the data from MongoDB
    Step 2: Exclude multi-ticker messages
    Step 3: Take negation into account:
            - "nothing", "no", "none", "neither", "never" or "nobody"
            - Convert digits to "numbertag"
            - Remove @USER
            - Remove tickers
            - Remove special characters
            - Lowercase the text
    Step 4: Remove stop words
    Step 5: Lemmatize
    Step 6: Train/test split
    """
    text = pd.DataFrame(list(db.messages.find(query)))

    # Count stocks per message
    text["count_stock"] = text["symbols"].apply(lambda x: len(x))
    # Keep single-ticker messages only
    text = text[text["count_stock"].isin([1])]

    # take negation into account
    # Convert digit to "numbertag"
    # Remove @USER
    # Remove unicode issue
    # Remove ticker
    # Remove all the special characters
    # remove all single characters
    # Remove Ya
    # Remove bitcoin
    # remove btc
    text["body_transform"] = text["body"].replace(
        regex={
            r"\bnothing\b": "negtag_nothing",
            r"\bno\b": "negtag_no",
            r"\bnone\b": "negtag_none",
            r"\bneither\b": "negtag_neither",
            r"\bnever\b": "negtag_never",
            r"\bnobody\b": "negtag_nobody",
            r"\d+": "numbertag ",
            r"([@?])(\w+)\b": "user",
            r"\b&#\b": " ",
            r"[$][A-Za-z][\S]*": "",
            r"\W": " ",
            r"\s+[a-zA-Z]\s+": " ",
            r"\^[a-zA-Z]\s+": " ",
            r"\s+": " ",
            r"^b\s+": "",
            r"\bya\b": "",
            r"\bbitcoin\b": "",
            r"\bBitcoin\b": "",
            r"\bbtc\b": "",
        })

    # Lowercase
    text["body_transform"] = text["body_transform"].str.lower()

    # Remove stop words
    stop = stopwords.words('english')
    text["body_transform"] = text["body_transform"].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop]))

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    text["body_transform"] = text["body_transform"].apply(lambda x: " ".join(
        [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)]))

    # Split the dataset
    X_ = text["body_transform"]
    y_ = text["sentiment_"]
    count_ = text.groupby("sentiment")["sentiment"].count()
    print("The shape of the data is {}, and {}".format(text.shape, count_))
    if to_train:
        X_train, X_test, y_train, y_test = train_test_split(X_,
                                                            y_,
                                                            test_size=0.1,
                                                            random_state=0)
        return X_train, X_test, y_train, y_test
    else:
        return X_
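# --- Added usage sketch (not from the original source) ---
# Minimal example of how metatransformation might be called, assuming a local
# MongoDB holding a "messages" collection with "body", "symbols" and sentiment
# fields. The connection string, database name and query are hypothetical
# placeholders.
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017/")
    db = client["stocktwits"]           # hypothetical database name
    query = {"symbols.symbol": "AAPL"}  # hypothetical query
    X_train, X_test, y_train, y_test = metatransformation(client, db, query,
                                                          to_train=True)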
'''
import pandas as pd
import nltk
from nltk import pos_tag, RegexpParser, Tree
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import re
import string

import config

lemmatizer = WordNetLemmatizer()
stopword = list(
    set(stopwords.words('english')).difference(config.REMOVE_FROM_STOPWORDS))
stopword.extend(config.STOPWORDS_EXTEND)
indicators = config.INDICATOR_LIST


def text_cleaner(sent):
    '''Remove punctuation and expand contractions (syntactical noise).'''
    sent = sent.lower()
    sent = re.sub(r"\'s", " is ", sent)
    # sent = re.sub(r"\'", "", sent)
    sent = re.sub(r"@", " ", sent)
    sent = re.sub(r"\'ve", " have ", sent)
def __init__(self, data, column_name):
    self.data = data
    self.column_name = column_name
    self.stemmer = PorterStemmer()
    self.lemmatiser = WordNetLemmatizer()
def __init__(self):
    nltk.download('wordnet')
    nltk.download('stopwords')
    self.wordnet_lemmatizer = WordNetLemmatizer()
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

paragraph = """
My biological mother found out later that my mother had never graduated from college
and that my father had never graduated from high school. She refused to sign the final
adoption papers. She only relented a few months later when my parents promised that I
would go to college. This was the start in my life. And 17 years later I did go to
college. But I naively chose a college that was almost as expensive as Stanford, and
all of my working-class parents’ savings were being spent on my college tuition. After
six months, I couldn’t see the value in it. I had no idea what I wanted to do with my
life and no idea how college was going to help me figure it out. And here I was
spending all of the money my parents had saved their entire life. So I decided to drop
out and trust that it would all work out OK. It was pretty scary at the time, but
looking back it was one of the best decisions I ever made. The minute I dropped out I
could stop taking the required classes that didn’t interest me, and begin dropping in
on the ones that looked far more interesting. It wasn’t all romantic. I didn’t have a
dorm room, so I slept on the floor in friends’ rooms, I returned coke bottles for the
$0.05 deposits to buy food with, and I would walk the 7 miles across town every Sunday
night to get one good meal a week at the Hare Krishna temple. I loved it.
"""

sentences = nltk.sent_tokenize(paragraph)  # convert the paragraph into sentences
lemmatizer = WordNetLemmatizer()           # create the lemmatizer object

# Lemmatization process: tokenize each sentence, drop stop words, lemmatize the rest
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words
             if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words)
def tokenize(fileItem: list) -> None:
    ps = PorterStemmer().stem
    wnl = WordNetLemmatizer()
    lem = wnl.lemmatize
    lemmaCache = dict()
    tokenDict = dict()
    filePath = fileItem[1]
    docID = int(fileItem[0])

    with open(filePath, 'r') as content_file:
        textContent = content_file.read()
    jsonOBJ = json.loads(textContent)
    htmlContent = jsonOBJ["content"]

    # initialize BeautifulSoup object and pass in html content
    soup = BeautifulSoup(htmlContent, 'html.parser')

    # Deletes HTML comments, javascript, and css from text
    for tag in soup(text=lambda text: isinstance(text, Comment)):
        tag.extract()
    for element in soup.findAll(['script', 'style']):
        element.extract()

    # Collect all words found in the html response, with each word paired to its tag ('word', 'tag')
    # Tags below are in order of importance/weight
    tagNamesList = [
        'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', 'a', 'p',
        'span', 'div'
    ]
    tagsTextList = []
    for tag in tagNamesList:
        tagsTextList.append(soup.find_all(tag))

    ##### REDIS ONLY START #####
    # urlContent = jsonOBJ["url"]
    # # return if html text has identical hash
    # # Add all tokens found from html response with tags removed
    # varTemp = soup.get_text()
    # if util.isHashSame(varTemp):
    #     util.addDuplicateURL(docID, urlContent)
    #     return
    # # Add unique url to redis
    # util.addUniqueURL(docID, urlContent)
    ##### REDIS ONLY END #####

    taggedTextDict = dict()
    for i, tagSubList in enumerate(tagsTextList):
        taggedTextDict[tagNamesList[i]] = list()
        for phrase in tagSubList:
            for word in re.split(r"[^a-z0-9']+", phrase.get_text().lower()):
                taggedTextDict.get(tagNamesList[i]).append(word)

    # Store words as tokens in tokenDict, ignore words that are bad
    for tag, wordList in taggedTextDict.items():
        for word in wordList:
            if (len(word) == 0):  # ignore empty strings
                continue
            if (len(word) > 30 and tag != 'a'):
                # ignore very long junk tokens (e.g. base64 image data such as "ivborw0kggo...")
                # but accept any URLs in <a> tags that may legitimately be long
                continue
            if (word[0] == "'"):  # ignore words that start with '
                continue
            if (len(word) == 1 and word.isalpha()):  # ignore single characters
                continue

            # will not change numbers/digits
            # lemmatize words that are 3 letters or longer
            if not any(char.isdigit() for char in word) and len(word) > 2 and word not in lemmaCache:
                # Lemmatization of a word with a number is usually itself.
                # Lemmatization of in, on, as is usually itself.
                # Checking for the above and whether the word is already cached saves time.
                # Get the part of speech of the word to make lemmatization more accurate.
                pos = tag_map[pos_tag((word, ))[0][1][0]]
                lemWord = lem(word, pos)  # lemmatized word
                # Catch words that lemmatization misses and use the Porter stemmer in their place
                if word[-2:] == "ly" or word[-4:] == "ness" or word[-3:] == "ish":
                    # Catches any ly, ness, or ish that lemmatize doesn't catch.
                    # Words are less accurate, but this cuts off extraneous words.
                    lemWord = ps(word)
                lemmaCache[word] = lemWord
            else:
                lemmaCache[word] = word  # the lemma of the word is itself

            if lemmaCache[word] in tokenDict:
                tokenDict.get(lemmaCache[word]).incFreq()
            else:
                tokenDict[lemmaCache[word]] = Posting(docID, 1, tag)

            if len(lemmaCache) > 5000000:
                # Cache up to 5 million tokens, then clear to prevent running out of memory
                lemmaCache.clear()

    # Write tokens and their Postings to a text file ("store on disk")
    buildIndex(tokenDict)
csv_f = csv.reader(f)
Abstracts_2011 = []
for row in csv_f:
    Abstracts_2011.append(row[0])
Abstracts_str_2011 = ' '.join(Abstracts_2011)

# Clean and remove stopwords from 2011 abstracts
alpha_only_2011 = re.sub("[^a-zA-Z]", " ", Abstracts_str_2011)
words_2011 = alpha_only_2011.lower().split()
# Was testing stopword removal with counter
# counter1 = Counter(words)
meaningful_words_2011 = [w for w in words_2011 if not w in stops]
# Was testing stopword removal with counter
# counter2 = Counter(meaningful_words)
patent_lemmatizer = WordNetLemmatizer()
# lemmatize() expects a single token, so lemmatize word by word
lemmatized_words_2011 = [patent_lemmatizer.lemmatize(w) for w in meaningful_words_2011]

# Read 2012 Abstracts from CSV file
g = open('PowerAbstracts_csv_2012.csv')
csv_g = csv.reader(g)
Abstracts_2012 = []
for row in csv_g:
    Abstracts_2012.append(row[0])
Abstracts_str_2012 = ' '.join(Abstracts_2012)

# Clean and remove stopwords from 2012 abstracts
alpha_only_2012 = re.sub("[^a-zA-Z]", " ", Abstracts_str_2012)
words_2012 = alpha_only_2012.lower().split()
# Was testing stopword removal with counter
# counter1 = Counter(words)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

positive_file_path = 'Data/positive.txt'
negative_file_path = 'Data/negative.txt'
myStemmer = WordNetLemmatizer()
n_lines = 10000000


def create_vocab(pos_file, neg_file):
    all_tokens = []
    for f in [pos_file, neg_file]:
        with open(f, 'r') as f:
            sentences = f.readlines()
            for s in sentences[:n_lines]:
                words_in_sentence = word_tokenize(s.lower())
                all_tokens += list(words_in_sentence)

    all_tokens = [myStemmer.lemmatize(i) for i in all_tokens]
    unique_words = Counter(all_tokens)
    vocab = []
    for w in unique_words:
        if 1000 > unique_words[w] > 50:
            vocab.append(w)
    print("The size of the vocab is:", len(vocab))
    return vocab
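# Added usage note (not from the original source): building the vocabulary from
# the two data files declared above.
vocab = create_vocab(positive_file_path, negative_file_path)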
def prep_text(df, model, datapath='./data/', stemming=0):
    """
    0. replace NA/missing values with ""
    1. filter out non-English patents for now
    2. remove punctuation
    3. remove stop words
    4. lemmatize
    5. stemming? and what else?
    :return:
    """
    print("Embeddings loaded, Preparing data for semantic analysis...")

    # 0. replace NA/missing values with ""
    df.fillna('', inplace=True)

    # 1. filter out non-English patents for now
    print("No multilingual support for semantic search yet, please give English inputs only.")
    df = df[df.lang == 'en']
    print(df.shape[0], "english patents exist.")

    # combine all text
    df['text'] = df[['titles', 'abstract', 'descriptions', 'claims']].values.tolist()

    # 2. remove punctuation
    print("Removing punctuations")
    df['text'] = df.text.apply(lambda text: str(text))
    df['text'] = df.text.apply(word_tokenize)
    df['text'] = df.text.apply(lambda text: [word for word in text if word.isalpha()])

    # filter out stop words from all languages
    print("Removing stopwords...")
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    stop_words_fr = set(stopwords.words('french'))
    stop_words_de = set(stopwords.words('german'))
    stop_words.update(stop_words_de)
    stop_words.update(stop_words_fr)
    df['text'] = df.text.apply(lambda text: [w for w in text if not w in stop_words])

    # convert each word to its meaningful base form, inferring the POS automatically
    print("Lemmatizing...")
    lemmatizer = WordNetLemmatizer()
    df['text'] = df.text.apply(
        lambda text: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in text])

    # stemming of words
    if stemming:
        porter = PorterStemmer()
        df['text'] = df.text.apply(lambda text: [porter.stem(word) for word in text])

    # keep only words that exist in our pretrained word2vec model
    print("Infering semantics")
    df['text'] = df.text.apply(lambda text: [word for word in text if word in model.vocab])
    df.dropna(inplace=True)
    df = df[df.text.apply(lambda text: len(text) > 0)]

    print("Counting occurrences")
    # count frequency of words
    df['freq_dict'] = df.text.apply(lambda text: dict(Counter(text)))

    # save as feather to disk
    df = df.reset_index(drop=True)
    df.to_feather(datapath + 'df_tok_freq.ftr')
    print("Dataframe saved to", datapath + "df_tok_freq.ftr with columns", df.columns.to_list())
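# The snippet above (and the lemmatize() helper further below) calls
# get_wordnet_pos(), which is not defined in these fragments. A minimal sketch of
# such a helper, assuming nltk.pos_tag and the WordNet POS constants, might look
# like this; it is an assumption, not the original author's definition.
from nltk import pos_tag
from nltk.corpus import wordnet


def get_wordnet_pos(word):
    """Map the Penn Treebank POS tag of `word` to a WordNet POS constant."""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # fall back to noun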
def __init__(self):
    self.wnl = WordNetLemmatizer()
def stemText(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    stemmed = []
    for word in text:
        stemmed.append(wordnet_lemmatizer.lemmatize(word))
    return stemmed
def lemmatize_tweet(tweet):
    lemmatizer = WordNetLemmatizer()
    word_list = word_tokenize(tweet)
    result = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return result
def lemmatizationFunct(x):
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    finalLem = [lemmatizer.lemmatize(s) for s in x]
    return finalLem
def __init__(self):
    self.lemmatizer = WordNetLemmatizer()
    self.mystopwords = stopwords.words('english') + ['n\'t', 'wo']
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stems = stem_lemmatize_tokens(tokens, lemmatizer, stemmer)
    return stems
abstract_process = re.sub(r'[\d]', '', abstract_process)
abstract_process = abstract_process.lower()
abstract_process = abstract_process.strip()
word_tokens = abstract_process.split()  # split the comment text into a list of words

# Step 6: Map words to contracted word dictionary and substitute them
word_tokens = [
    APPOSTOPHES[word] if word in APPOSTOPHES else word for word in word_tokens
]

# remove any empty items from the list, as they cause errors in pos_tag
word_tokens = [w for w in word_tokens if len(w) > 0]

# Step 7: Parts of Speech tagging of each word
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
new_word_token_list = []
new_word_token = nltk.pos_tag(word_tokens)
new_word_token_list.append(new_word_token)
[new_word_token_list] = new_word_token_list

# Step 8: Lemmatize Tokenized Text
post_lemm = []
for word, tag in new_word_token_list:
    wntag = tag[0].lower()
    # WordNet accepts 'a' (adjective), 'r' (adverb), 'n' (noun) and 'v' (verb);
    # note that Penn adjective tags start with 'J', so adjectives fall through
    # to None here and are kept unchanged
    wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
    if not wntag:
        lemma = word
        post_lemm.append(lemma)
    else:
        lemma = wnl.lemmatize(word, wntag)
        post_lemm.append(lemma)
def lemmatize(chunklist):
    lemmalist = []
    lemmatizer = WordNetLemmatizer()
    for i in range(len(chunklist)):
        lemmalist.append(' '.join([
            lemmatizer.lemmatize(w, get_wordnet_pos(w))
            for w in nltk.word_tokenize(chunklist[i])
        ]))
    return lemmalist
def __init__(self, decode_error="strict", strip_accents='unicode', lowercase=True,
             contractions=True, ignore_list=[], stopwords=None, remove_html=True,
             treat_urls="join", extract_phrases=True, treat_ner="replace_ner",
             lemmatize=False, stemming=True, spellcheck=False, tokenize=True,
             join_char="_"):
    '''A comprehensive text pre-processing class
    ...

    Attributes
    ----------
    contractions: bool
        whether to decontract words, e.g. you're -> you are
    ignore_list: list
        list of characters to remove from the text
    stopwords: set
        set of words to be considered as stopwords; if None, stopwords won't be removed
    remove_html: bool
        remove html tags; important for text data scraped from web pages
    treat_urls: str
        how urls present in the text should be treated; takes values from
        (join, remove, replace); if "replace", urls will be replaced with _url_
    extract_phrases: bool
        extract the most common phrases from the text and join them with a predefined character
    treat_ner: str
        extract and treat named entities in the text
        replace: entities will be replaced by a common token (_ner_)
        replace_ner: entities will be replaced by the corresponding entity type, e.g. _person_, _location_ etc.
        join: entity tokens will be joined together by a joining character
        None: don't extract NER
    lemmatize: bool
        lemmatize the tokens
    stemming: bool
        tokens are stemmed
    spellcheck: bool
        whether spelling should be corrected
    tokenize: bool
        text will be returned as a list of tokens

    Methods
    -------

    Returns
    -------
    '''
    self.lowercase = lowercase
    self.decode_error = decode_error
    self.strip_accents = strip_accents
    self.contractions = contractions
    self.ignore_list = ignore_list
    self.stopwords = stopwords
    self.remove_html = remove_html
    self.treat_urls = treat_urls
    self.extract_phrases = extract_phrases
    self.treat_ner = treat_ner
    self.lemmatize = lemmatize
    self.stemming = stemming
    self.spellcheck = spellcheck
    self.tokenize = tokenize
    self.join_char = join_char
    self.compound_pattern = re.compile(r'\w+(\-\w+)+')  # here-there
    self.stanford_ner = load_stanford_ner_tagger("stanford_ner_path")
    self.spell_checker = SpellChecker()
    self.stemmer = SnowballStemmer("english")
    self.lemmatizer = WordNetLemmatizer()
    disable = ['parser']
    if self.treat_ner is None:
        disable.append('ners')
    self.nlp = spacy.load('en_core_web_sm', disable=disable)
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

nltk.download('stopwords')
nltk.download('wordnet')

temporal_data = load_files(r"txt_sentoken")
X, y = temporal_data.data, temporal_data.target

documents = []
stemmer = WordNetLemmatizer()

# Pre-processing tasks
for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

# Bag of Words model to convert text documents into numerical features
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# Training and testing splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluating the model
print(confusion_matrix(y_test, y_pred))
def main(index_dir, silent, context_size, folder, use_gender):
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')
    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)
    words = get_lexicon()
    stop_words = set(
        list(STOP_WORDS) + [
            "know", "don", "ve", "say", "way", "said", "ll", "think", "thing",
            "don’t", "like", "got", "people", "going", "talk", "right",
            "happened", ">>"
        ])
    print("Stop words", stop_words)
    doc_idxs = range(144, 246923)
    word_idx_dic = {}
    idx_counter = 0

    # Create folder
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Create lemmatizer
    stemmer = WordNetLemmatizer()

    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            dic = {}
            count = 1
            if use_gender:
                intervals_gender = gender_to_time(str(doc_id), gender_reqs)
                postings = []
                for t1, t2 in intervals_gender:
                    postings.extend(index.intervals(int(doc_id), t1, t2))
            else:
                postings = index.intervals(int(doc_id))

            starttime = None
            for p in postings:
                if starttime is None:
                    starttime = p.start

                # Cut after 30s
                if p.end - starttime > 30 * count:
                    pickle.dump(
                        dic,
                        open(
                            os.path.join(
                                folder,
                                'Doc_%d_Chunk_%d.p' % (doc_id, count - 1)),
                            'wb'))
                    dic = {}
                    count += 1
                    starttime = p.end

                # Get words in posting
                tokens = index.tokens(0, p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    word = words[token]
                    # stemmed_word = stemmer.stem(word)
                    if word not in stop_words and len(word) > 1:
                        stemmed_word = stemmer.lemmatize(word)
                        # print("Word {} -> {}".format(word, stemmed_word))
                        if stemmed_word not in word_idx_dic.keys():
                            word_idx_dic[stemmed_word] = idx_counter
                            idx_counter += 1
                        idx_token = word_idx_dic[stemmed_word]
                        if idx_token in dic:
                            dic[idx_token] += 1
                        else:
                            dic[idx_token] = 1

    pickle.dump(word_idx_dic, open(os.path.join(folder, "word_idx.p"), "wb"))
def __init__(self, stem=False):
    self.wnl = WordNetLemmatizer()
    if stem:
        self.stemmer = SnowballStemmer('english')
    else:
        self.stemmer = Bunch(stem=lambda x: x)
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    tokenized = word_tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokenized]
def senti_features(corpus):
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    lemma = WordNetLemmatizer()
    NEGATE = [
        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
        "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
        "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
        "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
        "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing",
        "nowhere", "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
        "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't", "without",
        "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"
    ]
    X = []
    afinn = Afinn(emoticons=True)
    analyzer = SentimentIntensityAnalyzer()
    for line in corpus:
        token = tokenizer.tokenize(line)
        token = [
            word for word in token if word not in stopwords.words('english')
        ]
        # token = [porter.stem(i.lower()) for i in token]
        token = [lemma.lemmatize(word) for word in token]
        poseachtweet = []
        negeachtweet = []
        poseachtweet1 = []
        negeachtweet1 = []
        neutral = 0
        prev_neg = 0
        for lem in token:
            a, b = 0, 0
            syn = list(swn.senti_synsets(lem))
            for sy in syn:
                a += sy.pos_score()
                b += sy.neg_score()
            if (len(syn) != 0):
                a = a / len(syn)
                b = b / len(syn)
            if prev_neg == 1:
                a, b = b, a
            poseachtweet.append(a)
            negeachtweet.append(b * -1)
            sc = afinn.score(lem)
            if prev_neg == 1:
                sc = sc * -1
                prev_neg = 0
            if sc > 0:
                poseachtweet1.append(sc)
                negeachtweet1.append(0)
            elif sc < 0:
                negeachtweet1.append(sc)
                poseachtweet1.append(0)
            else:
                negeachtweet1.append(0)
                poseachtweet1.append(0)
            if lem in NEGATE:
                prev_neg = 1

        # feature defaults, used when the token list is empty
        max_pos = max_neg = imbal = avg_pos = avg_neg = pol = contrast = 0
        max_pos1 = max_neg1 = imbal1 = avg_pos1 = avg_neg1 = pol1 = contrast1 = 0
        tweetscore1 = avg_pos2 = avg_neg2 = pol2 = contrast2 = 0
        polarity = subjectivity = 0
        if (len(token) != 0):
            max_pos = max(poseachtweet)
            max_neg = min(negeachtweet)
            imbal = max_pos + max_neg
            avg_pos = np.count_nonzero(poseachtweet) / len(token)
            avg_neg = np.count_nonzero(negeachtweet) / len(token)
            pol = sum(poseachtweet) + sum(negeachtweet)
            if (max_pos != 0) and (max_neg != 0):
                contrast = 1
            max_pos1 = max(poseachtweet1)
            max_neg1 = min(negeachtweet1)
            imbal1 = max_pos1 + max_neg1
            avg_pos1 = np.count_nonzero(poseachtweet1) / len(token)
            avg_neg1 = np.count_nonzero(negeachtweet1) / len(token)
            pol1 = sum(poseachtweet1) + sum(negeachtweet1)
            if (max_pos1 != 0) and (max_neg1 != 0):
                contrast1 = 1
            tweetscore1 = afinn.score(line) / len(token)
            vs = analyzer.polarity_scores(line)
            avg_pos2 = vs['pos']
            avg_neg2 = vs['neg']
            pol2 = vs['compound']
            if (avg_pos2 != 0) and (avg_neg2 != 0):
                contrast2 = 1
            polarity = TextBlob(str(line)).sentiment.polarity
            subjectivity = TextBlob(str(line)).sentiment.subjectivity
        X.append([
            int(contrast), float(avg_pos), float(avg_neg), float(imbal),
            float(pol), int(contrast1), float(tweetscore1), float(avg_pos2),
            float(avg_neg2), float(polarity), float(subjectivity)
        ])
    return X
def lemmatize(token):
    """Returns the lemmatization of a token."""
    return WordNetLemmatizer().lemmatize(token, pos='v')
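# Added illustration (not from the original source): WordNetLemmatizer treats
# tokens as nouns by default, so passing pos='v' matters for verb forms.
from nltk.stem import WordNetLemmatizer

_wnl = WordNetLemmatizer()
print(_wnl.lemmatize("running"))           # 'running' (noun is the default POS)
print(_wnl.lemmatize("running", pos='v'))  # 'run'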
def my_clean(text, stops=False, stemming=False):
    text = str(text)
    text = re.sub(r" US ", " american ", text)
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"aren't", "are not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"%", " percent ", text)
    text = re.sub(r"that's", "that is ", text)
    text = re.sub(r"doesn't", "does not ", text)
    text = re.sub(r"he's", "he is ", text)
    text = re.sub(r"she's", "she is ", text)
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.lower().split()
    text = [w for w in text if len(w) >= 2]
    if stemming and stops:
        text = [word for word in text if word not in stopwords.words('english')]
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
        text = [word for word in text if word not in stopwords.words('english')]
    elif stops:
        text = [word for word in text if word not in stopwords.words('english')]
    elif stemming:
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    return text
def example_three():
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w, 'v') for w in text1]
    return len(set(lemmatized))
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from textblob import TextBlob
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

spell = SpellChecker()
ps = PorterStemmer()
wordnet = WordNetLemmatizer()


def speak(text):
    try:
        a = TextBlob(text).correct()
    except:
        a = text
    finally:
        return str(a)


df = pd.read_csv('ICD_CODES.tsv', sep='\t')
df['Issue'] = df['CODES'].str.split(n=1).str[1]
df['CODES'] = df.CODES.str.split().str.get(0)
df['Issuew2v'] = df['Issue']
df['Issuetextblob'] = df['Issue']
    # Adjectives
    AJ: { <CA>(<CC>?<CA>)* }
    # Entities
    EN: {<AJ>?<NN.*|FW>+}
    # Noun-phrases
    NP: {<DT>?<CC>?(<CC><CD>)*<EN>(<CC>?<EN>)*}
    # Rest should be considered as a Verb-Phrase Chunk
    VP: {<.*>+}
        }<NP>+{
'''

PARSER = RegexpParser(GRAMMAR)
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')


class TextParser:

    @staticmethod
    def calculate_similarity(a, b) -> float:
        return SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def generate_pos_tag_sets(input_string: str) -> next:
        """
        Break given string into sentences, and return their pos-tagged lists.

        **REQUIRES AN ACTIVE POS TAGGER TO BE RUNNING!!**

        :param input_string: input string. may contain one or more sentences
        """
if days < 0:
    creditScore['days_employed'] = creditScore['days_employed'].replace(days, 0)

creditScore['years_employed'] = (creditScore['days_employed'] / 365).astype(int).round()

# change float type to int for total_income
creditScore['total_income'] = creditScore['total_income'].astype(int)

# handle duplicates in education
creditScore['education'] = creditScore['education'].str.lower()

# handle duplicates in purpose
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmma = WordNetLemmatizer()
for purpose in creditScore['purpose']:
    words = nltk.word_tokenize(purpose)
    if 'education' in words or 'university' in words or 'educated' in words:
        creditScore['purpose'].replace(purpose, 'education', inplace=True)
    if 'car' in words or 'cars' in words:
        creditScore['purpose'].replace(purpose, 'car', inplace=True)
    if 'house' in words or 'housing' in words or 'estate' in words or 'property' in words:
        creditScore['purpose'].replace(purpose, 'real estate', inplace=True)
    if 'wedding' in words:
        creditScore['purpose'].replace(purpose, 'wedding', inplace=True)
def __init__(self):
    self.list_of_categories = ["sports", "health", "religion", "politics",
                               "technology", "science", "culture", "travel",
                               "food", "business"]
    self.lemmatizer = WordNetLemmatizer()
    file = open("classifier", "rb")
    self.clf = pickle.load(file)