from re import match
from nltk import word_tokenize, WordNetLemmatizer


def strict_preprocessor(corpus):
    # drop tokens made up entirely of non-word characters (punctuation/symbols)
    tokenized = [i for i in word_tokenize(corpus) if not match(r'(^\W*$)', i)]
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(i) for i in tokenized]
from qa_engine.score_answers import main as score_answers
import nltk, operator
from nltk.corpus import wordnet as wn
from word2vec_extractor import Word2vecExtractor
from nltk import WordNetLemmatizer
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

""" GLOBALS """
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
glove_w2v_file = "data/glove-w2v.txt"
DATA_DIR = "./wordnet"
W2vecextractor = Word2vecExtractor(glove_w2v_file)
lmtzr = WordNetLemmatizer()

""" UTILITY FUNCTIONS """

# The standard NLTK pipeline for POS tagging a document
def get_sentences(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
"""
@author: elliott
"""
from numpy import prod
from collections import Counter, Set
from nltk import sent_tokenize, ngrams, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tag import perceptron
import utils
import phrase_similarity
import numpy as np
import pickle

tagger = perceptron.PerceptronTagger()
porter = PorterStemmer()
snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

thermometers = [
    'democrats', 'republicans', 'protestants', 'catholics', 'jews', 'blacks',
    'whites', 'southerners', 'big business', 'labor unions', 'liberals',
    'conservatives', 'military', 'policemen', 'black militants',
    'civil rights leaders', 'chicanos hispanics', 'democratic party',
    'middle class people', 'people on welfare', 'political independents',
    'political parties', 'poor people', 'republican party',
    'womens right activist', 'young people', 'asian-americans', 'congress',
    'environmentalists', 'anti abortionists', 'federal government',
    'illegal aliens', 'christian fundamentalists', 'radical students',
    'farmers', 'feminists', 'evangelical groups', 'elderly', 'supreme court',
    'women'
]
def lemmatize_stemming(self, token):
    return SnowballStemmer("english").stem(WordNetLemmatizer().lemmatize(token, pos='v'))
def __init__(self):
    self.lemmatizer = WordNetLemmatizer()
def clean_text(text):
    import re
    import nltk
    nltk.download('stopwords')
    nltk.download('wordnet')

    # split into words by white space
    words = text.split()

    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in words]
    text = " ".join(text)
    #print(stripped[:100])
    ## Remove punctuation
    #text = text.translate(string.punctuation)
    ########################################################################################
    # replace urls
    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    # replace ips
    re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    # replace URLs
    text = re_url.sub("URL", text)
    # replace IPs
    text = re_ip.sub("IPADDRESS", text)

    ####################################################################
    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    #stops = set(stopwords.words("english"))
    #text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)

    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    ## Lemmatization (stemming kept commented out)
    text = text.split()
    #stemmer = SnowballStemmer('english')
    #stemmed_words = [stemmer.stem(word) for word in text]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)
    return text
def lemmatize_stemming(text):
    return PorterStemmer().stem(WordNetLemmatizer().lemmatize(text, pos='v'))
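# --- Added usage sketch (not from the original source) -----------------------
# Illustrates the lemmatize-then-stem order used by lemmatize_stemming above:
# WordNet lemmatization with pos='v' first, then Porter stemming.
from nltk import PorterStemmer, WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()
_stemmer = PorterStemmer()

for tok in ["running", "studies", "better"]:
    lemma = _lemmatizer.lemmatize(tok, pos='v')   # e.g. "running" -> "run"
    print(tok, "->", _stemmer.stem(lemma))        # e.g. "study" -> "studi"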
def __init__(self):
    self.wnl = WordNetLemmatizer()
def __init__(self, model):
    self.model = model
    self.lemmatizer = WordNetLemmatizer()
    self.intents = json.loads(open('intents.json').read())
    self.words = pickle.load(open('words.pkl', 'rb'))
    self.classes = pickle.load(open('classes.pkl', 'rb'))
def feature_maker(embed_file, dataframe, embed_signal='n'):
    '''Takes a path to an embeddings file and a dataframe as input; the default
    keyword embed_signal means that embeddings are not encoded by default.
    Returns an expanded dataframe with: a column of lemmatised words; a column
    of stemmed words; a column indicating capitalisation status; a column
    indicating capitalisation status of the previous token; and columns
    indicating shape, previous shape, short shape, previous short shape, and
    following token short shape.
    If kwarg embed_signal is 'y', a list of embeddings is also generated.
    '''
    wnl = WordNetLemmatizer()
    prtr = PorterStemmer()

    stringed_list = [str(x) for x in dataframe['token']]
    wn_lemma_list = [wnl.lemmatize(t) for t in stringed_list]
    dataframe['lemma'] = wn_lemma_list
    prtr_stemmer_list = [prtr.stem(t) for t in stringed_list]
    dataframe['stem'] = prtr_stemmer_list

    dataframe['caps'] = 'no caps'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'), ['caps']] = 'begin_cap'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'), ['caps']] = 'all_caps'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'), ['caps']] = 'caps_inside'

    temp_list = dataframe['caps'].to_list()
    temp_list.insert(0, 'no_cap')
    temp_list.pop()
    dataframe['prev_caps'] = temp_list

    dataframe['short_shape'] = 'x'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'), ['short_shape']] = 'Xx'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'), ['short_shape']] = 'XX'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'), ['short_shape']] = 'xXx'
    dataframe.loc[dataframe['token'].str.contains('\W'), ['short_shape']] = '-'

    prev_short_shape_list = dataframe['short_shape'].to_list()
    prev_short_shape_list.insert(0, '-')
    prev_short_shape_list.pop()
    dataframe['prev_short_shape'] = prev_short_shape_list

    next_short_shape_list = dataframe['short_shape'].to_list()
    next_short_shape_list.pop(0)
    next_short_shape_list.append('-')
    dataframe['next_short_shape'] = next_short_shape_list

    shape_list = []
    pre_list = []
    suf_list = []
    for text in dataframe['token']:
        prefix = text[:3]
        suffix = text[-3:]
        pre_list.append(prefix)
        suf_list.append(suffix)
        replace_caps = re.sub('[A-Z]', 'X', text)
        replace_lowers = re.sub('[a-z]', 'x', replace_caps)
        replace_digits = re.sub('\d', 'd', replace_lowers)
        shape_list.append(replace_digits)
    dataframe['shape'] = shape_list

    prev_shape_list = dataframe['shape'].to_list()
    prev_shape_list.insert(0, '-')
    prev_shape_list.pop()
    dataframe['prev_shape'] = prev_shape_list

    dataframe['prefix'] = pre_list
    dataframe['suffix'] = suf_list

    if embed_signal == 'y':
        word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
            embed_file, binary=True)
        embeddings = []
        for token in dataframe['token']:
            if token in word_embedding_model:
                vector = word_embedding_model[token]
            else:
                vector = [0] * 300
            embeddings.append(vector)
        return dataframe, embeddings
    else:
        return dataframe
def lemmatize(word, pos):
    global lemmer
    if lemmer is None:
        lemmer = WordNetLemmatizer()
    return lemmer.lemmatize(word, get_wordnet_pos(pos))
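# --- Added sketch (assumption, not from the original source) -----------------
# lemmatize() above relies on a get_wordnet_pos() helper that is not shown.
# A common way to write it is to map Penn Treebank tags to WordNet POS tags:
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    # hypothetical helper: 'JJ*' -> adjective, 'VB*' -> verb, 'RB*' -> adverb
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # WordNetLemmatizer's default POS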
def test_main():
    test_data = pd.read_csv(
        r'C:\Users\Lovely\PycharmProjects\fake_news_classifier\data\raw\test.csv'
    )
    test_data.head()
    test_data.isnull().sum()
    test_data.info()
    test_data.dropna(inplace=True)
    print("null count in data \n", test_data.isnull().sum())
    test_data.drop(['id', 'author'], axis=1, inplace=True)
    print(test_data.head())

    # cleaning test.csv data
    def data_cleaning(raw_data):
        #print(raw_data)
        raw_data = str(raw_data)
        raw_data = re.sub(r'\W', " ", raw_data)    # removing non-word characters
        raw_data = re.sub(r'[0-9]', " ", raw_data)
        raw_data = re.sub(r'\s+', " ", raw_data)   # removing extra spaces
        raw_data = raw_data.lower()
        return raw_data

    test_data['text'] = test_data['text'].apply(lambda x: data_cleaning(x))
    test_data['title'] = test_data['title'].apply(lambda x: data_cleaning(x))

    # As we combined text and title in train.csv and trained the model on that,
    # we follow the same step here to avoid data loss and keep the data format the same
    test_data['text'] = test_data['title'] + test_data['text']

    lm = WordNetLemmatizer()
    data_collect1 = []
    stoplist = set(nltk.corpus.stopwords.words("english"))

    def data_preprocessing_test(raw_data):
        #raw_data=str(raw_data)
        words = word_tokenize(raw_data)
        words = [
            word for word in words
            if word not in stoplist and word not in string.punctuation
        ]
        words = [word for word in words if len(word) > 1]
        words = [lm.lemmatize(word) for word in words]
        words = " ".join(words)
        data_collect1.append(words)
        return words

    test_data['text'] = test_data['text'].apply(
        lambda x: data_preprocessing_test(x))
    test_data['text'].head()

    voc_words1 = 8000
    c = 0
    for word in data_collect1:
        c += len(set(word))
    print(c)

    # Making test.csv match the model's input data format so that we can predict on it
    one_hot_repr1 = [one_hot(word, voc_words1) for word in data_collect1]
    one_hot_repr1[:5]

    sent_len = 50
    padded_data1 = pad_sequences(one_hot_repr1, padding='pre', maxlen=sent_len)
    padded_data1[0]
    x_test_data = np.array(padded_data1)
    print("printing x_test_data \n", x_test_data)

    # Loading our trained model from pickle file
    filename = r'C:\Users\Lovely\PycharmProjects\fake_news_classifier\src\models\model_pickle.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    model = loaded_model

    # Model prediction on test data
    y_pred1 = model.predict_classes(x_test_data)
    print("predicted data shape \n", y_pred1.shape)
    print("predicted data :\n ", y_pred1)

    from pandas.core.common import flatten
    y_pred1 = list(flatten(y_pred1))  # flatten from 2D to 1D
    result = pd.Series(y_pred1, name='label')
    result.unique()

    output = pd.concat([pd.Series(range(1, 4576), name="Id"), result], axis=1)
    print("saving output to submit file\n")
    output.to_csv(
        r"C:\Users\Lovely\PycharmProjects\fake_news_classifier\data\processed\submit.csv",
        index=False)

    submit_data = pd.read_csv(
        r"C:\Users\Lovely\PycharmProjects\fake_news_classifier\data\processed\submit.csv"
    )
    submit_data.tail(10)
    print(submit_data.info())
    print("Top 5 data of Submit file :\n")
    print(submit_data.head())
def __init__(self):
    self.w = WordNetLemmatizer()
    self.cache = MemoryCache()
from nltk import LancasterStemmer

lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer

snl = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + snl.stem(words))

# lemmatizing
from nltk import WordNetLemmatizer

wordnet = WordNetLemmatizer()
for words in words_stem:
    print(words + " :" + wordnet.lemmatize(words))

# pos - parts of speech

# stopwords
from nltk.corpus import stopwords

stopwords.words("english")
print(len(stopwords.words("english")))

import re

punctuation = re.compile(r'[-.?,:;()|{}|0-9]')
post_punctuation = []
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(unicode(sentence, errors='ignore'))
    ]
def generate_part2_dict(ibex_data, unique_id):
    """Given an ibex results file, returns a dictionary of the following format --
    mystery word: [target, highest rated guess, lowest rated guess]"""
    Lemmy = WordNetLemmatizer()
    with open(ibex_data, 'rb+') as ibex_data:
        ibex_data = csv.reader(
            filter(lambda data_row: data_row[0] != '#', ibex_data))
        ibex_data = list(ibex_data)
        subject_id = unique_id
        subject_age = ibex_data[1][8]
        subject_sex = ibex_data[2][8]
        ibex_data = filter(lambda row: row[5] != 'end', ibex_data)
        ibex_data = filter(lambda row: row[5] != 'intro3', ibex_data)
        ibex_data = [[x.lower() for x in y] for y in ibex_data]
        subj_dict = {}
        guess_and_confidence = []
        previous_line = ['', '', '', '', '', '', '', '', '']
        trial_identifier = 5
        mystery_word, target_word, guess, confidence = 0, 1, 2, 2
        for current_line in ibex_data:
            # print "Current line:" + str(current_line)
            if current_line[trial_identifier] == previous_line[trial_identifier]:
                # print "match"
                current_line_info = current_line[trial_identifier].split("_")
                previous_line_info = previous_line[trial_identifier].split("_")
                current_line_info = [x.lower() for x in current_line_info]
                previous_line_info = [x.lower() for x in previous_line_info]
                if (current_line_info[target_word],
                        current_line_info[mystery_word]) not in subj_dict:
                    subj_dict[(current_line_info[target_word],
                               current_line_info[mystery_word])] = [
                                   (previous_line[8], current_line[8])
                               ]
                else:
                    if (current_line_info[target_word],
                            current_line_info[mystery_word]) in subj_dict:
                        subj_dict[(current_line_info[target_word],
                                   current_line_info[mystery_word])] += [
                                       (previous_line[8], current_line[8])
                                   ]
            previous_line = current_line
        if len(subj_dict) != 12:
            raise ValueError(
                "ERROR: subj_dict does not equal 12. Check input results file")
        part_2_dict = defaultdict(list)
        # initialize a new dictionary for tracking some stats about the subject responses
        response_stats = defaultdict(list)
        for target_w_mystery_w, g_c_list in subj_dict.iteritems():
            g_c_reversed = reversed(g_c_list)
            g_c_reversed = list(g_c_reversed)
            guesses = []
            correct_answer_alternate_form = False
            for gc in g_c_reversed:
                lemmatized_guess = Lemmy.lemmatize(
                    gc[0].strip().decode('unicode_escape').encode(
                        'ascii', 'ignore'), pos='n')
                lemmatized_guess = lemmatized_guess.encode('utf-8')
                for k, v in correct_answers.iteritems():
                    if lemmatized_guess in v:
                        correct_answer_alternate_form = lemmatized_guess
                        lemmatized_guess = k
                guesses.append((lemmatized_guess, gc[1]))
            guesses = [(x[0], int(x[1])) for x in guesses]
            # find if the target word was guessed during learning
            # and, find the highest confidence for that guess
            # and, find the number of times it was guessed
            target_guessed = 0
            target_highest_confidence = 'NA'
            target_n_times_guessed = 'NA'
            if correct_answer_alternate_form:
                target_guessed = 1
                target_highest_confidence = max(x[1] for x in guesses
                                                if x[0] == lemmatized_guess)
                target_n_times_guessed = sum(x[0] == lemmatized_guess
                                             for x in guesses)
            elif target_w_mystery_w[0] in [x[0] for x in g_c_reversed]:
                target_guessed = 1
                target_highest_confidence = max(
                    x[1] for x in guesses if x[0] == target_w_mystery_w[0])
                target_n_times_guessed = sum(x[0] == target_w_mystery_w[0]
                                             for x in guesses)
            response_stats[target_w_mystery_w[0]] = [
                target_guessed, target_highest_confidence,
                target_n_times_guessed
            ]
            guesses = [gc for gc in guesses if gc[0] != target_w_mystery_w[0]]
            if not guesses:
                guesses = [
                    (random.choice(frequent_words), random.randint(1, 5)),
                    (random.choice(frequent_words), random.randint(1, 5)),
                    (random.choice(frequent_words), random.randint(1, 5))
                ]
            highest_confidence = max(x[1] for x in guesses)
            lowest_confidence = min(x[1] for x in guesses)
            highest_guesses = map(
                lambda x: x if x[1] >= highest_confidence else None, guesses)
            lowest_guesses = map(
                lambda x: x if x[1] <= lowest_confidence else None, guesses)
            highest_guesses = (x for x in highest_guesses if x is not None)
            lowest_guesses = (x for x in lowest_guesses if x is not None)
            highest_guess = next(highest_guesses, None)
            lowest_guess = next(lowest_guesses, None)
            highest_guess = highest_guess[0]
            lowest_guess = lowest_guess[0]
            if highest_guess == lowest_guess:
                # print "high-low match"
                lowest_guess = next(lowest_guesses, None)
                lowest_guess = lowest_guess[0] if type(
                    lowest_guess) is tuple else None
            highest_guessed = 0
            highest_guess_highest_confidence = 'NA'
            highest_guess_n_times_guessed = 'NA'
            lowest_guessed = 0
            lowest_guess_highest_confidence = 'NA'
            lowest_guess_n_times_guessed = 'NA'
            if highest_guess in [x[0] for x in guesses]:
                highest_guessed = 1
                highest_guess_highest_confidence = max(
                    x[1] for x in guesses if x[0] == highest_guess)
                highest_guess_n_times_guessed = sum(x[0] == highest_guess
                                                    for x in guesses)
            if lowest_guess in [x[0] for x in guesses]:
                lowest_guessed = 1
                lowest_guess_highest_confidence = max(x[1] for x in guesses
                                                      if x[0] == lowest_guess)
                lowest_guess_n_times_guessed = sum(x[0] == lowest_guess
                                                   for x in guesses)
            response_stats[highest_guess] = [
                highest_guessed, highest_guess_highest_confidence,
                highest_guess_n_times_guessed
            ]
            response_stats[lowest_guess] = [
                lowest_guessed, lowest_guess_highest_confidence,
                lowest_guess_n_times_guessed
            ]
            response_stats['distractor'] = [0, 'NA', 'NA']
            target_word = correct_answer_alternate_form if correct_answer_alternate_form else target_w_mystery_w[0]
            part_2_dict[target_w_mystery_w[1]] = [
                target_word, highest_guess, lowest_guess
            ]
        # print subject_id
        return [part_2_dict, response_stats]
#==========================================
# Author: Shierene Cervantes
#==========================================
# python script

import pandas as pd
import numpy as np
import pylab as pl
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk import WordNetLemmatizer

stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

training_data = pd.read_csv('oneyearcategorized.csv', encoding="latin1")
training_data['Title'] = training_data['Title'].astype(str)
training_data['Description'] = training_data['Description'].astype(str)
training_data['title_and_description'] = training_data[['Title', 'Description']].apply(tuple, axis=1)
training_data = training_data.astype(str)
training_data['title_and_description'] = training_data['title_and_description'].astype(str)

train_data = training_data.to_dict('records')
train_data

corpus_words = {}
class_words = {}
unnecessary_words = ['please', 'it', 'we', 'hi', 'is', "'s", '?', ',', ':', '..', '.', '|', '#', '-', '<', '>', '(', ')', '{', '}']
# nltk.download()  # To make sure all nltk site packages are up to date and installed to get started with nltk
import nltk
from nltk import PorterStemmer
from nltk import WordNetLemmatizer

paragraph = """Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly, I just want to say this: Making The Revenant was about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. Thank you so very much."""

## Tokenizing sentences
sentences = nltk.sent_tokenize(paragraph)
# print(sentences)

## Tokenizing words
# wordz = nltk.word_tokenize(paragraph)
# print(wordz)

# stemmer = PorterStemmer()  # Creating an object of PorterStemmer class
lemmatizer = WordNetLemmatizer()  # Creating an object of WordNetLemmatizer class

## Stemming
# for i in range(len(sentences)):
#     words = nltk.word_tokenize(sentences[i])  # Word Tokenization on sentences list.
#     stemmed_words = [stemmer.stem(word) for word in words]  # List comprehension usage, stemming each word of a single sentence at a time.
#     sentences[i] = ' '.join(stemmed_words)  # Joining all stemmed words back into sentences using space delimiter and join function
# print(sentences)

## Lemmatization
for j in range(len(sentences)):
    words = nltk.word_tokenize(sentences[j])  # Word Tokenization on sentences list.
    lemmatized_words = [
        lemmatizer.lemmatize(word) for word in words
    ]  # Lemmatizing each word of a single sentence at a time.
    sentences[j] = ' '.join(lemmatized_words)  # Joining lemmatized words back into a sentence.
def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
    self.lower = lower
    self.strip = strip
    self.stopwords = stopwords or set(sw.words('english'))
    self.punct = punct or set(string.punctuation)
    self.lemmatizer = WordNetLemmatizer()
import os
def universal_check(words, tag, universal_tag, not_arranged_universal_tag):
    value = ASL_Structure_DB.get_one_where(universal_tag)['ASL']
    gloss = ""
    supergloss = []
    [second_value] = [value.split()]
    for tag2 in second_value:
        for value, tag1, tag3, tag4 in zip(words, tag, universal_tag,
                                           not_arranged_universal_tag):
            # if value == "a":
            #     continue
            if value.lower() == "this":
                value = "ix-that"
            if tag2 == tag3:
                lamentizer = WordNetLemmatizer()
                lemmatized_tokens = lamentizer.lemmatize(value.lower(), pos='v')
                if lemmatized_tokens.lower() == "i":
                    lemmatized_tokens = "me"
                elif lemmatized_tokens.lower() == "n't":
                    lemmatized_tokens = "not"
                elif lemmatized_tokens.lower() == "'s":
                    lemmatized_tokens = "is"
                if value.lower() == "bit":
                    lemmatized_tokens = value
                supergloss.append((lemmatized_tokens, tag4))
                gloss += lemmatized_tokens + ","
    gloss = gloss.replace(",", " ").upper()
    gloss = gloss[:-1]
    value = []
    new_new_value = []
    rules = {
        '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
        '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
    }
    change = []
    cou = 0
    for word, tag in supergloss:
        if word != ".":
            new_value = Database.get_one_where(word.capitalize(), tag)
            if new_value is None:
                if cou < len(gloss.split()):
                    change.append(cou)
                if word.isdigit():
                    new_value = "None"
                    for letter in word:
                        new_letter = rules[letter]
                        for letter in new_letter:
                            new_value = Database.get_one_where(
                                letter.capitalize(), '.')
                            value.append(new_value)
                else:
                    for letter in word:
                        # print(letter)
                        if letter != "'":
                            new_value = Database.get_one_where(
                                letter.capitalize(), '.')
                            value.append(new_value)
                new_new_value.append(value)
                value = []
            else:
                new_new_value.append(new_value)
            cou = cou + 1
    new_new_new_value = []
    for i in enumerate(new_new_value):
        a = 0
        if isinstance(new_new_value[i[0]], str):
            new_new_new_value.append(new_new_value[i[0]])
        else:
            while a < len(new_new_value[i[0]]):
                new_new_new_value.append(new_new_value[i[0]][a])
                a += 1
    value = " ".join(new_new_new_value)
    gloss = gloss.split()
    for y in change:
        # print(y)
        word1 = ""
        for i, word in enumerate(gloss):
            if i == y:
                if word[:1] == "'":
                    word = word[1:]
                for i in word:
                    word1 += i + "-"
                # print(word1[:-1])
                gloss[y] = word1[:-1]
    gloss = " ".join(gloss)
    # gloss = str(gloss).replace(",", " ").upper()
    return {'value': value, 'gloss': gloss}
def lemmatizing(self, text):
    wl = WordNetLemmatizer()
    return [wl.lemmatize(word) for word in text]
def __init__(self):
    self.stopwords = set(stopwords.words('english'))
    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    self.sia = SentimentIntensityAnalyzer()
def tokenizer(data: DataFrame, rows, columns):
    tokenDict = dict()    # "<entry>": (tf(overall), df, [list of docs it appears in])
    tokenDocs = dict()
    tokPostings = dict()  # "<entry>": {docid: [tf in that doc, max_tf, doclen], ...}
    docInfo = dict()
    lematizer = WordNetLemmatizer()
    stopWords = set(stopwords.words("english"))

    for i in range(0, rows):
        tf = 1
        max_tf = 1
        doclen = 0
        docNo = i
        tokens1 = word_tokenize(data["Title"][i])
        tokens = list()
        #print(data["Text"][i])
        sentenceList = sent_tokenize(data["Text"][i])
        for sentence in sentenceList:
            tmp = word_tokenize(sentence)
            for t in tmp:
                tokens.append(t)
        #tokens = word_tokenize(sent_tokenize(data["Text"]))
        for t in tokens1:
            tokens.append(t)
        for tok in tokens:
            doclen += 1
            if tok in stopWords:
                continue
            word = lematizer.lemmatize(tok)
            if word in tokenDict:
                tokenDict[word] = tokenDict.get(word) + 1
                tokenDocs[word].add(docNo)
                # tokPostings[word].
            else:
                tokenDict[word] = 1
                tokenDocs[word] = {docNo}
                # tokPostings[word] = {docNo:1}
            if word in tokPostings:
                if docNo in tokPostings[word].keys():
                    tokPostings[word][docNo][0] = tokPostings[word][docNo][0] + 1
                    tf = tokPostings[word][docNo][0]
                    if tf > max_tf:
                        max_tf = tf
                else:
                    tokPostings[word][docNo] = [1, 0, 0]
            else:
                tokPostings[word] = {docNo: [1, 0, 0]}  # {docid: (tf, max_tf, doclen)}
        docInfo[docNo] = [max_tf, doclen]

    for word in tokPostings.keys():
        for doc in tokPostings[word]:
            tokPostings[word][int(doc)][1] = docInfo[int(doc)][0]
            tokPostings[word][int(doc)][2] = docInfo[int(doc)][1]

    sumOfDoclens = 0
    for doc in docInfo:
        sumOfDoclens += docInfo[doc][1]
    avgDoclen = sumOfDoclens / rows

    fullTokenDict = combineDicts(tokenDict, tokenDocs)  # combine dictionaries with same key set
    if fullTokenDict == -1:
        print("Failed in combining dictionaries")
        return
    # else:
    #     print(fullTokenDict)
    #     print(tokenDict)
    # stemmedTokenDict, stemmedTokenDocs = stemmer(tokenDict)
    return fullTokenDict, tokPostings, avgDoclen
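# --- Added sketch (assumption, not from the original source) -----------------
# combineDicts() is called above but not shown. Based on the comment
# '"<entry>": (tf(overall), df, [list of docs it appears in])', a plausible
# implementation merges the overall-tf dict and the doc-set dict, returning -1
# when the key sets disagree (which tokenizer() treats as a failure):
def combineDicts(tokenDict, tokenDocs):
    if set(tokenDict.keys()) != set(tokenDocs.keys()):
        return -1
    return {
        word: (tokenDict[word], len(tokenDocs[word]), sorted(tokenDocs[word]))
        for word in tokenDict
    }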
def __init__(self, stopwords=None):
    self.stopwords = stopwords or set(sw.words('english'))
    self.lemmatizer = WordNetLemmatizer()
    self.word_cache = {}
def build_analyzer(self):
    lemm = WordNetLemmatizer()
    analyzer = super(LemmaCountVectorizer, self).build_analyzer()
    # lemmatize each token, keeping only non-numeric tokens of length >= 3
    return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc)
                        if (not w.isdigit()) and len(w) >= 3)
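# --- Added usage sketch (assumption, not from the original source) -----------
# The snippet above overrides build_analyzer() on a class named
# LemmaCountVectorizer; the usual setup is a subclass of sklearn's
# CountVectorizer, roughly like this:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import WordNetLemmatizer


class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        lemm = WordNetLemmatizer()
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc)
                            if (not w.isdigit()) and len(w) >= 3)


# vec = LemmaCountVectorizer(stop_words='english')
# X = vec.fit_transform(["The cats were sitting on the mats"])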
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(str(sentence))
    ]
def __init__(self):
    # Lemmatizer for shortening each word to a more-commonly-used form of the word
    self._lemmatizer = WordNetLemmatizer()
    # Scraper to get common keywords from response
    self._keyword_scraper = KeywordScraper
    # Maximum number of types
    self._max_types = 3
    # Obvious religious keywords. These must be lowercase
    self._religion_words = [
        'god', 'spiritual', 'religion', 'worship', 'church', 'prayer'
    ]
    # Regex for url of government websites
    self._government_detector = r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[Gg][Oo][Vv](\.[a-zA-Z]{2})?$'
    # Lowest (highest number) rank a keyword can have and still count towards determining organization type
    self._max_rank = 40
    # Keywords to look for for other types. These must be lowercase
    self._type_words = {
        OrgTypesEnum.EDUCATION: [
            'education', 'school', 'study', 'teach',
        ],
        OrgTypesEnum.ADVOCACY: [
            'advocacy', 'lobby', 'policy',
        ],
        OrgTypesEnum.RESEARCH: [
            'research', 'conduct', 'document', 'identify', 'analyze',
            'correlate', 'compile', 'report', 'data', 'publication',
            'journal', 'periodical', 'newsletter',
        ],
        OrgTypesEnum.PREVENTION: [
            'prevention', 'intervention', 'education', 'development',
            'community', 'ownership', 'avoidance', 'blockage', 'determent',
            'forestalling', 'halt', 'hindrance', 'impediment', 'inhibitor',
            'interception', 'interruption', 'obstacle', 'obstruction',
            'prohibition', 'stoppage', 'thwarting', 'deterence',
        ],
        OrgTypesEnum.PROTECTION: [
            'protection', 'rescue', 'rehabilitation', 'reintegration',
            'repatriation', 'empowerment', 'repatriation', 'fulfilment',
            'freedom', 'opportunity', 'women', 'conservation', 'insurance',
            'preservation', 'safeguard', 'safety', 'security', 'shelter',
            'stability', 'assurance', 'barrier', 'cover', 'custody',
            'defense', 'fix', 'guard', 'invulnerability', 'reassurance',
            'refuge', 'safekeeping', 'salvation', 'screen', 'self-defense',
            'shield', 'strength', 'surety', 'guarding',
        ],
        OrgTypesEnum.PROSECUTION: [
            'prosecution', 'compliance', 'abolish', 'law', 'enforcement',
            'regulatory', 'regulation', 'justice', 'case', 'cause', 'claim',
            'lawsuit', 'litigation', 'proceeding', 'suit',
        ],
    }
    # Stem search words (religious, general)
    self._religion_words = [
        self._lemmatizer.lemmatize(word) for word in self._religion_words
    ]
    for key in self._type_words.iterkeys():
        self._type_words[key] = [
            self._lemmatizer.lemmatize(word) for word in self._type_words[key]
        ]
or 'Negative' sentiment but a continuous rating.
"""
import nltk as nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk import PorterStemmer
from nltk import word_tokenize, WordNetLemmatizer
from collections import Counter
from nltk import NaiveBayesClassifier, classify

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop = set(stopwords.words('english'))
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()


def perform_nlp(text):
    """
    remove English stopwords
    removes punctuations
    lemmatizes words
    stems words
    """
    text = ''.join(c for c in text if c not in punctuation)
    # tokenize the sentence
    text = word_tokenize(text)
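# --- Added sketch (assumption, not from the original source) -----------------
# perform_nlp() above is truncated after tokenization. Following its docstring,
# a plausible continuation filters stopwords and then lemmatizes and stems each
# token, reusing the module-level `stop`, `wnl`, and `stemmer` objects:
def perform_nlp_rest(tokens):
    tokens = [w.lower() for w in tokens if w.lower() not in stop]  # drop stopwords
    tokens = [wnl.lemmatize(w) for w in tokens]                    # lemmatize
    tokens = [stemmer.stem(w) for w in tokens]                     # stem
    return tokens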
def get_processed_posting_list_operations(query_words_deque: deque,
                                          operations: deque) -> dict:
    left_word_query: str = query_words_deque.popleft()  # get first input word
    left_word_query = WordNetLemmatizer().lemmatize(left_word_query)
    left_dict_post_list: dict = get_posting_list_for_token(left_word_query)
    multiple_postings: list = list()
    flag_next: bool = False
    while True:
        try:
            right_word_query: str = query_words_deque.popleft()  # get next word
            right_word_query = WordNetLemmatizer().lemmatize(right_word_query)
            right_dict_post_list: dict = get_posting_list_for_token(
                right_word_query)
            curr_operation: str = operations.popleft()  # current operation match keywords
            if curr_operation != "and":  # "and" is the only operation processed via multiple_postings
                if multiple_postings:  # if reached end of an "and" chain (next operation is e.g. "or")
                    left_dict_post_list = intersect_many_posting_lists(
                        multiple_postings)
                    multiple_postings = list()  # clean for future
                if curr_operation == "or":
                    left_dict_post_list = union_posting_lists(
                        left_dict_post_list,
                        right_dict_post_list)  # update left postings with union value
                elif curr_operation == "ornot":
                    left_dict_post_list = union_posting_lists(
                        left_dict_post_list,
                        not_postings_list(right_dict_post_list))
                elif curr_operation == "andnot":
                    if left_dict_post_list and right_dict_post_list:
                        left_dict_post_list = subtract_from_left_right_posting_lists(
                            left_dict_post_list, right_dict_post_list)
            else:
                if not flag_next:  # there are no more tokens combined with "and"
                    if multiple_postings:
                        multiple_postings.append(
                            right_dict_post_list)  # store while not reached end
                    else:
                        multiple_postings = [
                            left_dict_post_list, right_dict_post_list
                        ]  # initialization
                if not (left_dict_post_list or right_dict_post_list):  # only empty
                    flag_next = True
                    multiple_postings = list()
        except IndexError:
            # print("End of query")
            # if curr_operation == "not":  # empty right already checked
            #     left_dict_post_list = not_postings_list(left_dict_post_list)  # update left postings
            #     # with intersect value
            if multiple_postings:
                left_dict_post_list = intersect_many_posting_lists(
                    multiple_postings)  # all remaining "and" terms processed here
            break
    return left_dict_post_list