import re

import nltk
from nltk.stem import PorterStemmer

import file_handler
import stemmer


def porter_stemming_retrieval_cos(query):
    """Expand the query via Porter stemming, then rank documents with the
    title-weighted cosine scorer."""
    global stemmed_vocab
    vocab = file_handler.open_json_file("library/vocab1.txt")
    expanded_query = []
    # Normalise the query terms in place: strip punctuation and lowercase.
    for i, word in enumerate(query):
        word = re.sub(r'[^\w*]', '', word)
        query[i] = word.lower()
    stemmed_query = stemmer.porter_word_stemmer(query)
    stemmed_vocab = file_handler.open_json_file("library/porter_vocab.txt")
    # Replace each stemmed term with every original vocabulary term that
    # shares the same stem.
    for word in stemmed_query:
        if word in stemmed_vocab:
            old_ID_list = stemmed_vocab[word]
            for term_id in old_ID_list:
                expanded_query.append(vocab[term_id])
    answer, scores = cos_retriever_title(expanded_query)
    return answer, scores
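# `stemmer` is a local project module that is not part of this excerpt. A
# minimal sketch of its porter_word_stemmer helper, assuming it simply maps
# NLTK's PorterStemmer over the query tokens (an assumption, not the module's
# actual code; kept as a comment so it does not stand in for the real helper):
#
#     from nltk.stem import PorterStemmer
#
#     def porter_word_stemmer(words):
#         porter = PorterStemmer()
#         return [porter.stem(word) for word in words]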
def main():
    porter = file_handler.open_json_file("library/porter_vocab.txt")
    lemma = file_handler.open_json_file("library/lemma_vocab.txt")
    vocab = file_handler.open_json_file("library/vocab1.txt")
    print(len(porter))
    print(len(lemma))
    print(len(vocab))
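# `file_handler` is another local module not included in this excerpt. Its
# open_json_file helper is presumably a thin wrapper around json.load; a
# hedged sketch under that assumption:
#
#     import json
#
#     def open_json_file(path):
#         with open(path, "r") as f:
#             return json.load(f)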
def cosineScore_title_weighing_score_return(query):
    """Score documents against the query with tf-idf weights, boost documents
    whose titles contain query terms, and return the ranked doc ids."""
    global doclength
    global postings
    docTitles = file_handler.open_json_file("library/doctitles1.txt")
    scores = {}
    no_of_docs = len(doclength)
    for term in query:
        term = re.sub(r'[^\w*]', '', term.lower())
        try:
            termid = vocab.index(term)
        except ValueError:
            print('"{}" is not in the dictionary'.format(term))
            continue
        posting_list = postings[str(termid)]
        no_docs_with_term = len(posting_list)
        idf = calculate_idf(no_of_docs, no_docs_with_term)
        for doc_id in posting_list:
            tf = calculate_tf(posting_list[doc_id], doclength[doc_id])
            idf_tf_weight = tf * idf
            update_score(scores, doc_id, idf_tf_weight)
            score_title(termid, doc_id, docTitles, scores)
    # Rank documents by their accumulated scores, highest first.
    answer = sorted(scores, key=scores.get, reverse=True)
    return answer, scores
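# calculate_tf, calculate_idf, update_score and score_title are defined
# elsewhere in the project. Under a standard tf-idf scheme the first three
# would look roughly like this (a sketch under that assumption, not the
# project's actual bodies):
#
#     import math
#
#     def calculate_tf(term_count, doc_length):
#         # term frequency normalised by document length
#         return term_count / doc_length
#
#     def calculate_idf(no_of_docs, no_docs_with_term):
#         # inverse document frequency
#         return math.log(no_of_docs / no_docs_with_term)
#
#     def update_score(scores, doc_id, weight):
#         # accumulate the tf-idf weight for this document
#         scores[doc_id] = scores.get(doc_id, 0) + weight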
def lemmatize_retrieval(query):
    """Return the ids of all documents containing any term that shares a lemma
    with the query terms."""
    global lemma_vocab
    postings = file_handler.open_json_file("library/postings1.txt")
    final_list = []
    lemma_query = stemmer.word_lemmatizer(query)
    lemma_vocab = file_handler.open_json_file("library/lemma_vocab.txt")
    for word in lemma_query:
        word = re.sub(r'[^\w*]', '', word.lower())
        if word in lemma_vocab:
            # Each lemma maps to the ids of the original vocabulary terms it covers.
            old_ID_list = lemma_vocab[word]
            for term_id in old_ID_list:
                for doc_id in postings[str(term_id)].keys():
                    final_list.append(doc_id)
    return final_list
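# stemmer.word_lemmatizer is the lemmatizing counterpart of
# stemmer.porter_word_stemmer sketched earlier. Assuming it wraps NLTK's
# WordNetLemmatizer, it would look roughly like (an assumption):
#
#     def word_lemmatizer(words):
#         lemmatizer = nltk.WordNetLemmatizer()
#         return [lemmatizer.lemmatize(word) for word in words]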
def porter_stemming_retrieval(query):
    """Return the ids of all documents containing any term that shares a
    Porter stem with the query terms."""
    global stemmed_vocab
    postings = file_handler.open_json_file("library/postings1.txt")
    final_list = []
    stemmed_query = stemmer.porter_word_stemmer(query)
    stemmed_vocab = file_handler.open_json_file("library/porter_vocab.txt")
    for word in stemmed_query:
        word = re.sub(r'[^\w*]', '', word).lower()
        if word in stemmed_vocab:
            old_ID_list = stemmed_vocab[word]
            for term_id in old_ID_list:
                for doc_id in postings[str(term_id)].keys():
                    final_list.append(doc_id)
    return final_list
def vocab_lemmatize(split_text):
    lemmatized_vocab = {}
    lemmatizer = nltk.WordNetLemmatizer()
    vocab = file_handler.open_json_file("library/vocab1.txt")
    for word in split_text:
        oldId = vocab.index(word)
        lemmatized_word = lemmatizer.lemmatize(word)
        if lemmatized_word in lemmatized_vocab:
            lemmatized_vocab[lemmatized_word].append(oldId)
        else:
            lemmatized_vocab[lemmatized_word] = [oldId]
    return lemmatized_vocab
def vocab_stemmer_porter(split_text):
    porter_stemmed_vocab = {}
    porter_stemmer = PorterStemmer()
    vocab = file_handler.open_json_file("library/vocab1.txt")
    for word in split_text:
        oldId = vocab.index(word)
        porter_stemmed_word = porter_stemmer.stem(word)
        if porter_stemmed_word in porter_stemmed_vocab:
            porter_stemmed_vocab[porter_stemmed_word].append(oldId)
        else:
            porter_stemmed_vocab[porter_stemmed_word] = [oldId]
    return porter_stemmed_vocab
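# vocab_stemmer_porter and vocab_lemmatize build the stem/lemma -> [original
# term id] maps that the retrieval functions above load from
# library/porter_vocab.txt and library/lemma_vocab.txt. A hedged sketch of how
# those files might be produced from vocab1.txt (the actual build step is not
# shown in this file):
#
#     import json
#
#     vocab = file_handler.open_json_file("library/vocab1.txt")
#     with open("library/porter_vocab.txt", "w") as f:
#         json.dump(vocab_stemmer_porter(vocab), f)
#     with open("library/lemma_vocab.txt", "w") as f:
#         json.dump(vocab_lemmatize(vocab), f)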
def get_url(id):
    docids = file_handler.open_json_file("library/docids1.txt")
    return docids[int(id)]
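# Entry-point guard (an assumption; the excerpt does not show one) so that
# importing this module does not run main() as a side effect.
if __name__ == "__main__":
    main()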