def __init__(self, corpus_path=None, corpus_frac=None, max_df=0.65,
             min_word_len=3, max_tfidf_features=10000, n_svd_components=100,
             label_col=None, id_col='id', append_pos_tags=False):
    self.max_df = max_df
    self.min_word_len = min_word_len
    self.max_tfidf_features = max_tfidf_features
    self.n_svd_components = n_svd_components
    self.append_pos_tags = append_pos_tags
    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    self.stopwords = []
    with (DATA_DIR / 'misc' / 'stopwords.txt').open('r') as fd:
        self.stopwords = [line.strip() for line in fd]
    self.regex = {
        'number': re.compile(r'[0-9]+?'),
        # note: the email branch originally ended in '.+?s'; '\s' is the likely intent
        'web_email': re.compile(
            r'((www.+?\s)|(http.+?\s)|([a-z]+?\@.+?\s)|(\.[a-z]{2,3}))'),
        'spacer': re.compile(r'[\_\-]'),
        'punct': re.compile(
            r'[\[\]\'\.,\/\#\!\?\$\%\^\&\*;\:{}=\_`~\(\)\n\r�\<\>\@\\]+?')
    }
    self.tokenizer = str.split
    self.corpus = self.load_corpus(corpus_path, label_col=label_col,
                                   corpus_frac=corpus_frac)
    self.training_corpus, self.testing_corpus = self.split_corpus()
    self.vectorizer_params = {
        # 'lowercase': True,             # Covered by preprocessor
        # 'stop_words': self.stopwords,  # Covered by preprocessor
        'analyzer': 'word',
        'preprocessor': self.preprocess,
        'tokenizer': self.tokenizer,
        'max_df': self.max_df,
        'max_features': self.max_tfidf_features,
    }
    self.svd_params = {'n_components': self.n_svd_components, 'n_iter': 5}
    self.count_vectorizer = None
    self.tfidf_transformer = TfidfTransformer()
    self.tfidf_vectorizer = None
    self.svd = None
    self.lsa = None
    self.set_vectorizers()
    self.set_svd()
import re
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import PorterStemmer, WordNetLemmatizer

data = pd.read_csv('spam.csv', sep=',', encoding='latin-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data['Type'] = data['Type'].map({'ham': 0, 'spam': 1})

X = data['Message']
y = data['Type']

stem = PorterStemmer()
stop_words = set(stopwords.words('english'))  # build the stop-word set once, not per row

corpus = []
for i in range(len(data)):
    words = re.sub('[^a-zA-Z]', ' ', data['Message'][i])
    words = words.lower()
    words = words.split()
    words = [stem.stem(word) for word in words if word not in stop_words]
    words = ' '.join(words)
    corpus.append(words)

# creating Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
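The script stops right after the CountVectorizer import. Below is a minimal sketch of how the bag-of-words step and a classifier could follow; the max_features value, the train/test split, and the MultinomialNB choice are assumptions, not part of the original.

# Hedged continuation: vectorize the cleaned corpus and fit a simple classifier.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

cv = CountVectorizer(max_features=2500)
X_bow = cv.fit_transform(corpus).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_bow, y, test_size=0.2, random_state=0)

clf = MultinomialNB()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))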
best_seller_group = shampoo.groupby('best_selling')
best_seller_group.agg(['mean', 'std', 'median'])

rating_mask = shampoo['rating'].notnull()
rating_group = shampoo.loc[rating_mask, :].copy()  # copy to avoid SettingWithCopyWarning
rating_group['rating'] = rating_group['rating'].astype('float')
rating_grouped = rating_group.groupby('best_selling')
rating_grouped.agg(['mean', 'std', 'median'])

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
from textblob import TextBlob

stop = stopwords.words('english')
stemmer = PorterStemmer()

df = pd.read_csv('description_df')
df['nlp_description'] = df['nlp_description'].astype('string')

# add product-specific stop words
stop.extend([
    'shampoo', 'conditioner', 'soap', 'cleanse', 'hair', 'head', 'shoulders',
    'loréal', 'pari', 'product', 'help', 'use', 'free', 'make', 'type'
])

# Pre-processing: remove stop words
df['nlp_description'] = df['nlp_description'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop))
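The fragment above builds `stemmer` but never applies it. A minimal sketch of the assumed next step, stemming each description after stop-word removal; this column update is an assumption about the intended pipeline, not code from the original.

# Assumed follow-up: apply the Porter stemmer to the filtered descriptions.
df['nlp_description'] = df['nlp_description'].apply(
    lambda text: " ".join(stemmer.stem(word) for word in text.split()))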
def stem_words(f):
    # tokenize the input, then Porter-stem each token in place
    stemmer = PorterStemmer()
    processed = tokenize(f)
    for i in range(len(processed)):
        processed[i] = stemmer.stem(processed[i])
    return processed
def stem_it(self):
    stemmer = PorterStemmer()
    self.word = stemmer.stem(self.word)
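stem_it assumes an enclosing class that keeps a single token in self.word. A minimal sketch of such a wrapper, for illustration only; the class name WordToken and its constructor are hypothetical.

# Hypothetical wrapper class showing how stem_it might be used.
from nltk import PorterStemmer


class WordToken:
    def __init__(self, word):
        self.word = word

    def stem_it(self):
        stemmer = PorterStemmer()
        self.word = stemmer.stem(self.word)


token = WordToken("running")
token.stem_it()
print(token.word)  # typically "run"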
import json
import re

from nltk import PorterStemmer, pos_tag


def tfidf_classifier(fname):
    with open(fname + ".txt", "r") as file:
        paragraph = file.read()

    # clean the extracted content
    paragraph = " ".join(re.findall(r"\b[a-z0-9]+\b", paragraph, flags=re.I)).lower()

    # get the part of speech for every word in the content
    pos_tag_words = pos_tag(paragraph.split())
    porter_stemmer_obj = PorterStemmer()
    stem = porter_stemmer_obj.stem
    pos_tag_words = [(str(stem(tag[0])), tag[-1]) if tag[-1].startswith("VB") else tag
                     for tag in pos_tag_words]
    paragraph = " ".join([w[0] for w in pos_tag_words])

    # extract all the nouns, adjectives, adverbs and verbs from the paragraph
    temp_noun_adj_list = []
    temp_verb_adv_list = []
    all_words = []
    all_words_count_dict = {}
    for pos_words in pos_tag_words:
        if pos_words[-1].startswith("NN") or pos_words[-1].startswith("JJ"):
            temp_noun_adj_list.append(pos_words[0])
            if len(temp_verb_adv_list) > 1:
                adv_verb_str = " ".join(temp_verb_adv_list)
                if adv_verb_str not in all_words_count_dict:
                    all_words_count_dict[adv_verb_str] = paragraph.count(adv_verb_str)
                temp_verb_adv_list = []
            elif temp_verb_adv_list:
                if temp_verb_adv_list[0] not in all_words_count_dict:
                    all_words_count_dict[temp_verb_adv_list[0]] = paragraph.count(temp_verb_adv_list[0])
                temp_verb_adv_list = []
        elif pos_words[-1].startswith("VB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[temp_noun_adj_list[0]] = paragraph.count(temp_noun_adj_list[0])
                temp_noun_adj_list = []
        elif pos_words[-1].startswith("RB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[temp_noun_adj_list[0]] = paragraph.count(temp_noun_adj_list[0])
                temp_noun_adj_list = []
        else:
            if temp_noun_adj_list:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
                temp_noun_adj_list = []
            if temp_verb_adv_list:
                adv_str = " ".join(temp_verb_adv_list)
                if adv_str not in all_words_count_dict:
                    all_words_count_dict[adv_str] = paragraph.count(adv_str)
                temp_verb_adv_list = []

    # flush any phrases still buffered after the loop
    if len(temp_noun_adj_list) > 0:
        adj_noun_str = " ".join(temp_noun_adj_list)
        if adj_noun_str not in all_words_count_dict:
            all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
    if len(temp_verb_adv_list) > 0:
        adv_str = " ".join(temp_verb_adv_list)
        if adv_str not in all_words_count_dict:
            all_words_count_dict[adv_str] = paragraph.count(adv_str)

    with open(fname + ".json", "w") as file:
        json.dump(all_words_count_dict, file)
def feature_maker(embed_file, dataframe, embed_signal='n'):
    '''Takes a path to an embeddings file and a dataframe as input - the default
    keyword embed_signal means that embeddings are not encoded by default.

    Returns an expanded dataframe with: a column of lemmatised words; a column
    of stemmed words; a column indicating capitalisation status; a column
    indicating capitalisation status of the previous token; columns indicating
    shape, previous shape, short shape, previous short shape, and the following
    token's short shape. If kwarg embed_signal is 'y', a list of embeddings is
    also generated.
    '''
    wnl = WordNetLemmatizer()
    prtr = PorterStemmer()

    stringed_list = [str(x) for x in dataframe['token']]
    wn_lemma_list = [wnl.lemmatize(t) for t in stringed_list]
    dataframe['lemma'] = wn_lemma_list
    prtr_stemmer_list = [prtr.stem(t) for t in stringed_list]
    dataframe['stem'] = prtr_stemmer_list

    dataframe['caps'] = 'no caps'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'), ['caps']] = 'begin_cap'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'), ['caps']] = 'all_caps'
    # note: the original pattern was '[a-z][A-Z]]' with a stray ']', which would
    # require a literal ']' after the capital; dropped here
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'), ['caps']] = 'caps_inside'

    temp_list = dataframe['caps'].to_list()
    temp_list.insert(0, 'no_cap')
    temp_list.pop()
    dataframe['prev_caps'] = temp_list

    dataframe['short_shape'] = 'x'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'), ['short_shape']] = 'Xx'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'), ['short_shape']] = 'XX'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'), ['short_shape']] = 'xXx'
    dataframe.loc[dataframe['token'].str.contains(r'\W'), ['short_shape']] = '-'

    prev_short_shape_list = dataframe['short_shape'].to_list()
    prev_short_shape_list.insert(0, '-')
    prev_short_shape_list.pop()
    dataframe['prev_short_shape'] = prev_short_shape_list

    next_short_shape_list = dataframe['short_shape'].to_list()
    next_short_shape_list.pop(0)
    next_short_shape_list.append('-')
    dataframe['next_short_shape'] = next_short_shape_list

    shape_list = []
    pre_list = []
    suf_list = []
    for text in dataframe['token']:
        prefix = text[:3]
        suffix = text[-3:]
        pre_list.append(prefix)
        suf_list.append(suffix)
        replace_caps = re.sub('[A-Z]', 'X', text)
        replace_lowers = re.sub('[a-z]', 'x', replace_caps)
        replace_digits = re.sub(r'\d', 'd', replace_lowers)
        shape_list.append(replace_digits)
    dataframe['shape'] = shape_list

    prev_shape_list = dataframe['shape'].to_list()
    prev_shape_list.insert(0, '-')
    prev_shape_list.pop()
    dataframe['prev_shape'] = prev_shape_list

    dataframe['prefix'] = pre_list
    dataframe['suffix'] = suf_list

    if embed_signal == 'y':
        word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
            embed_file, binary=True)
        embeddings = []
        for token in dataframe['token']:
            if token in word_embedding_model:
                vector = word_embedding_model[token]
            else:
                vector = [0] * 300
            embeddings.append(vector)
        return dataframe, embeddings
    else:
        return dataframe
def stem(word):
    word = PorterStemmer().stem(word)
    return word
def stemming_Porter(tokens):
    stemmer = PorterStemmer()
    return [
        Token(stemmer.stem(word.token), word.pos, forceToken=True)
        for word in tokens
    ]
def stem(word_list):
    stemmer = PorterStemmer()  # build the stemmer once instead of once per word
    return map(stemmer.stem, word_list)
def stemming(word):
    # PorterStemmer.stem_word() was removed from NLTK; stem() is the current API
    word = PorterStemmer().stem(word.lower())
    return word
# (fragment of a menu loop: this branch sits inside an if/elif chain on `index`)
    listKata = []
    for genre, kata in lyricsData:
        if genre == listGenre[choice - 1]:
            for word in word_tokenize(kata):
                valid = True
                for w in word:
                    if w in string.punctuation:
                        valid = False
                word = word.lower()
                if word not in stopwords.words("english") and valid:
                    # stem, then lemmatize as an adjective; the original discarded
                    # this result and appended the raw word, which looks like a bug
                    word = WordNetLemmatizer().lemmatize(PorterStemmer().stem(word), pos='a')
                    listKata.append(word)
    hasilFreqDist = FreqDist(listKata)
    print("20 most common words")
    print("===================")
    for kata, freq in hasilFreqDist.most_common(20):
        print(kata, " -> ", freq)
    input("press enter to go back")
elif index == 3:
    saveFile = open('genre.pickle', "wb")
    pickle.dump(genreClassifier, saveFile)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 7.
"""
# Stemming words - test your tools

from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('nation'))
print(LancasterStemmer().stem('nationality'))
print(LancasterStemmer().stem('nationally'))
print(LancasterStemmer().stem('natural'))
print(LancasterStemmer().stem('naturally'))
print(LancasterStemmer().stem('nature'))
print()

print('PorterStemmer')
print(PorterStemmer().stem('nation'))
print(PorterStemmer().stem('nationality'))
print(PorterStemmer().stem('nationally'))
print(PorterStemmer().stem('natural'))
print(PorterStemmer().stem('naturally'))
print(PorterStemmer().stem('nature'))
detectClasses = u.detectClasses
extract_classes = u.extract_classes

classifier = MLPClassifier(verbose=True,
                           early_stopping=True,
                           max_iter=10,
                           hidden_layer_sizes=(300, 300),
                           tol=0.000001)  # F1=0.50
# classifier = RandomForestClassifier(max_depth=3000, n_jobs=4, n_estimators=20)  # F1=0.30
# classifier = ExtraTreeClassifier(max_depth=1000)  # F1=0.32
# classifier = GaussianNB()  # not working with simultaneous multiclass

vectorizer = TFIDFVectorizer(mx_features=None,
                             ngram_range=(1, 2),
                             minDf=10,
                             maxDF=0.98,
                             token_transformer=PorterStemmer().stem)
# vectorizer = BagOfWordsVectorizer(mx_features=None, n_gram_range=(1, 2), minDf=10, maxDF=0.98,
#                                   token_transformer=PorterStemmer().stem)

# ------------------- Configuration Section ---------------

print("Loading dataset")
dataset = pd.read_csv("C:\\tmp\\dabble\\movies_metadata.csv")

print("Preprocessing")
dataset = pre_process(dataset)  # lower case, cleanse, etc.

print("Detecting classes")
dataset, class_count = detectClasses(
    dataset, column=CLASS_COLUMN,
    prefix=CLASS_PREFIX)  # generates new columns, one per class
quotes_token = nltk.word_tokenize(qt)

quotes_bigrams = list(nltk.bigrams(quotes_token))
print(quotes_bigrams)

quotes_trigrams = list(nltk.trigrams(quotes_token))
print(quotes_trigrams)

quotes_quadgrams = list(nltk.ngrams(quotes_token, 4))
print(quotes_quadgrams)

# stemming
from nltk import PorterStemmer
pst = PorterStemmer()
pst.stem("having")
pst.stem("sudeep")

words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print(words + " :" + pst.stem(words))

from nltk import LancasterStemmer
lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer
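The snippet ends at the SnowballStemmer import. A short continuation in the same style as the Porter and Lancaster loops above; SnowballStemmer requires an explicit language argument, assumed here to be "english".

# Assumed continuation: SnowballStemmer needs a language argument.
sbst = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + sbst.stem(words))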
def clean_text(text):
    tc = TextCleaner(text, PorterStemmer())
    return tc.remove_stop_words().remove_punctuation().stem().tokenize()
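clean_text relies on a TextCleaner class that is not shown. Below is a minimal sketch of what such a chainable cleaner might look like; only the method names come from the call chain above, and everything else (lowercasing, NLTK stop words, str.translate for punctuation) is an assumption.

# Hypothetical TextCleaner sketch; only the method names are taken from clean_text.
import string

from nltk import PorterStemmer
from nltk.corpus import stopwords


class TextCleaner:
    def __init__(self, text, stemmer):
        self.text = text.lower()
        self.stemmer = stemmer
        self.stop_words = set(stopwords.words('english'))

    def remove_stop_words(self):
        self.text = " ".join(w for w in self.text.split()
                             if w not in self.stop_words)
        return self  # return self so the calls can be chained

    def remove_punctuation(self):
        self.text = self.text.translate(
            str.maketrans('', '', string.punctuation))
        return self

    def stem(self):
        self.text = " ".join(self.stemmer.stem(w) for w in self.text.split())
        return self

    def tokenize(self):
        return self.text.split()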
def try_basic_query_tokenizer():
    stemmer = PorterStemmer()
    x = "answer(cityid('new york', _))"
    y = basic_query_tokenizer(
        x, strtok=lambda x: [stemmer.stem(xe) for xe in x.split()])
def process_email(email_contents: str) -> List[int]:
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: the body of an email
    :return: a list of indices of the words contained in the email
    """
    # Load the vocabulary.
    vocabulary_dict = get_vocabulary_dict()

    # Initialize the return value.
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # header_token = '\n\n'
    # header_start = email_contents.find(header_token)
    # email_contents = email_contents[header_start+len(header_token):]

    # Convert email content to lower case.
    email_contents = email_contents.lower()

    # Strip all HTML.
    # Looks for any expression that starts with < and ends with >, contains no
    # other < or > inside the tag, and replaces it with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle numbers.
    # Convert all sequences of digits (0-9) to a 'number' token.
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs.
    # Convert all strings starting with http:// or https:// to a 'httpaddr' token.
    email_contents = re.sub(r'(http://|https://)+\S*', 'httpaddr', email_contents)

    # Handle email addresses.
    # Convert all strings with @ in the middle to a 'emailaddr' token.
    email_contents = re.sub(r'[\S*]+(@)+\S*', 'emailaddr', email_contents)

    # Handle $ sign.
    # Convert all sequences of $ signs to a 'dollar' token.
    email_contents = re.sub('[$]', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Process file
    col = 0

    # Tokenize and also get rid of any punctuation
    tokens = re.split('[ @$/#.-:&*\+=\[\]?!\(\)\{\},'
                      '">_<;#\n\r]', email_contents)

    for token in tokens:
        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = PorterStemmer().stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        for i, word in vocabulary_dict.items():
            if token == word:
                word_indices.append(i)

        # Print to screen, ensuring that the output lines are not too long
        if (col + len(token) + 1) > 78:
            print('')
            col = 0
        print('{} '.format(token), end='', flush=True)
        col = col + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
import string

import pandas as pd
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
# from nltk.tokenize import word_tokenize

df = pd.read_json('related_data_rm_duplicacy.json')
QATags = df.content
# print(QATags)
QATags = list(QATags)
# print(QATags[:10])

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
port = PorterStemmer()


def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    # print(stop_free)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    # print(punc_free)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    stem = " ".join(port.stem(word) for word in normalized.split())
    remove_non_english = stem.encode("ascii", errors="ignore").decode()
    return remove_non_english


Text_clean = [clean(doc).split() for doc in QATags]
def stem(self):
    """Description: stem tokens with Porter Stemmer."""
    self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
def stem(array):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in array]
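A quick usage example for stem; the commented output reflects typical Porter stemmer behaviour for these words.

print(stem(["running", "flies", "easily"]))  # typically ['run', 'fli', 'easili']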
def __init__(self):
    self.ps = PorterStemmer()
def __init__(self):
    self.stemmer = PorterStemmer()
def __init__(self):
    self.speechProcessor = SpeechProcessor()
    self.stemmer = PorterStemmer()
    self.propositions = []
    self.synsetsList = []
from collections import defaultdict
import re
import json
import math
import string

from nltk import PorterStemmer
from nltk.corpus import words

INDEX_DICT = {}
# DOC_ID_DICT = {}
directory = "C:\\Users\\tajun\\PycharmProjects\\ICS-121\\DevlopZip\\DEV"
doc_counter = 0
partial_counter = 0
NumOfDocs = 0
ps = PorterStemmer()
token_count = 0
output_dict = {}  # shape: {filenum: (word, [list of postings])}
skip_count = 0


class Postings:
    """Each doc id is a posting."""

    def __init__(self, docid, positions):
        self.docid = docid
        self.positions = positions
        self.tfidf = 0  # use freq counts for now
        # self.fields = fields


# TODO: a function that takes in a file name, tokenizes it, and returns a list
# of tokens (possibly a list of lists where the first element is the token,
# the second is the count, and so on) - see the sketch below.
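The trailing comment above describes a tokenizer that has not been written yet. A minimal sketch under those stated requirements; the function name and the alphanumeric token pattern are assumptions.

# Hypothetical tokenizer sketch matching the comment above; it reuses the
# module-level Porter stemmer `ps` and returns a flat list of stemmed tokens.
def tokenize_file(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as fh:
        text = fh.read().lower()
    return [ps.stem(tok) for tok in re.findall(r"[a-z0-9]+", text)]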
def run(
        lr=0.001,
        batsize=20,
        epochs=100,
        embdim=64,
        encdim=128,
        numlayers=1,
        dropout=.25,
        wreg=1e-10,
        cuda=False,
        gpu=0,
        minfreq=2,
        gradnorm=3.,
        beamsize=1,
        cosine_restarts=1.,
        seed=456789,
):
    # DONE: Porter stemmer
    # DONE: linear attention
    # DONE: grad norm
    # DONE: beam search
    # DONE: lr scheduler
    print(locals())
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)

    tt.tick("loading data")
    stemmer = PorterStemmer()
    tokenizer = lambda x: [stemmer.stem(xe) for xe in x.split()]
    ds = GeoQueryDatasetFunQL(
        sentence_encoder=SequenceEncoder(tokenizer=tokenizer),
        min_freq=minfreq)
    train_dl = ds.dataloader("train", batsize=batsize)
    test_dl = ds.dataloader("test", batsize=batsize)
    tt.tock("data loaded")

    do_rare_stats(ds)

    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = create_model(embdim=embdim,
                         hdim=encdim,
                         dropout=dropout,
                         numlayers=numlayers,
                         sentence_encoder=ds.sentence_encoder,
                         query_encoder=ds.query_encoder,
                         feedatt=True)
    # model.apply(initializer)

    tfdecoder = SeqDecoder(
        model,
        tf_ratio=1.,
        eval=[
            CELoss(ignore_index=0, mode="logprobs"),
            SeqAccuracies(),
            TreeAccuracy(
                tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
        ])
    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")

    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(
            model,
            maxtime=100,
            tf_ratio=0.,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])
        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    else:
        print("Doing beam search!")
        freedecoder = BeamDecoder(
            model,
            beamsize=beamsize,
            maxtime=60,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])
        vlosses = make_array_of_metrics("seq_acc", "tree_acc")

    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))
    # print(dict(tfdecoder.named_parameters()).keys())

    # 4. define optim
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)
    # optim = torch.optim.SGD(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max} ({epochs} * {len(train_dl)})")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function (using partial)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=train_dl,
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=test_dl,
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=tfdecoder, dataloader=test_dl, losses=vlosses, device=device)

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")
def s(tokens):
    return [PorterStemmer().stem(t) for t in tokens]
def __init__(self):
    super().__init__()
    self._stemmer = PorterStemmer()
def queryResults(queryString, vocabDict, documents, numberOfRowsForResults):
    stop_words = set(stopwords.words('english'))
    scores = {}
    N = len(documents)

    queryString = queryString.lower()
    # queryStringExpansion = queryExpansionMethod(model_glove_twitter, queryString)
    queryStringExpansion = queryString

    # create our tokenizer that will also remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # turn contractions like "I'm" and "can't" into "Im" and "cant"
    queryString = queryString.replace("'", "")
    # tokenize here
    queryString = tokenizer.tokenize(queryString)
    # remove stop words and stem
    porterStemmer = PorterStemmer()
    queryString = [
        porterStemmer.stem(w) for w in queryString if w not in stop_words
    ]

    # collect the weights for the query string and its length
    weightsForQuery = {}
    lengthOfQuery = 0
    for stemword in queryString:
        if stemword.isnumeric():
            continue
        # check whether the stem word is actually in our vocab; if not, skip it
        if stemword not in vocabDict:
            continue
        # docsFoundForStemWord = vocabDict[stemword]
        # calculate weight for query word i
        df_i = vocabDict[stemword][0]
        tf_iq = queryString.count(stemword) / len(queryString)
        idf = math.log((N / df_i), 2)
        w_iq = (0.5 + 0.5 * tf_iq) * idf
        if stemword not in weightsForQuery:
            weightsForQuery[stemword] = w_iq
            lengthOfQuery += w_iq ** 2

    # we now have the length of the query vector and a dict of weights w_iq
    lengthOfQuery = math.sqrt(lengthOfQuery)
    # print(weightsForQuery)

    for word in weightsForQuery:
        docsFoundForStemWord = vocabDict[word][1]
        for doc in docsFoundForStemWord:
            scores[doc] = cosineCalculator(doc, documents, lengthOfQuery,
                                           weightsForQuery)

    arrayOfSortedScoresTuples = sorted(scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)

    # a dictionary that stores the documents and their new scores after query expansion
    arrayOfSortedScoresTuplesExpanded = {}
    for i in range(len(arrayOfSortedScoresTuples)):
        docId = arrayOfSortedScoresTuples[i][0]
        originalScore = arrayOfSortedScoresTuples[i][1]
        docSentence = documents[docId][0]  # get sentence

        # get the tokens present in our twitter embedding model
        tokens_1 = [t for t in docSentence.split() if t in model_glove_twitter]
        tokens_2 = [
            t for t in queryStringExpansion.split() if t in model_glove_twitter
        ]
        cosine = 0
        if len(tokens_1) > 0 and len(tokens_2) > 0:
            cosine = model_glove_twitter.n_similarity(tokens_1, tokens_2)

        # take the average of both scores
        newScoreAvg = (originalScore + cosine) / 2
        # store the score with the document
        arrayOfSortedScoresTuplesExpanded[docId] = newScoreAvg

    # sort by highest value
    arrayOfSortedScoresTuplesExpanded = sorted(
        arrayOfSortedScoresTuplesExpanded.items(),
        key=lambda x: x[1],
        reverse=True)

    return arrayOfSortedScoresTuplesExpanded[:numberOfRowsForResults]
def stemming_by_portter_1(term):
    return PorterStemmer().stem(term)