Example #1
def porter_stemming_retrieval_cos(query):
	global stemmed_vocab
	vocab = file_handler.open_json_file("library/vocab1.txt")

	# Normalize the query in place: strip punctuation and lowercase.
	# enumerate avoids the query.index(word) bug, which always returns
	# the first occurrence when a token appears more than once.
	for i, word in enumerate(query):
		word = re.sub(r'[^\w*]', '', word)
		query[i] = word.lower()

	stemmed_query = stemmer.porter_word_stemmer(query)
	stemmed_vocab = file_handler.open_json_file("library/porter_vocab.txt")

	# Expand each stemmed term back to every original vocabulary word
	# that shares its stem, then rank with the cosine retriever.
	expanded_query = []
	for word in stemmed_query:
		if word in stemmed_vocab:
			for old_id in stemmed_vocab[word]:
				expanded_query.append(vocab[old_id])

	answer, scores = cos_retriever_title(expanded_query)
	return answer, scores
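
A hypothetical call site for this retriever, assuming the query arrives as a list of raw tokens (get_url is the helper from Example #8; the query string itself is illustrative):

query = "information retrieval systems".split()
answer, scores = porter_stemming_retrieval_cos(query)
for doc_id in answer[:10]:
    # Print the ten best-scoring documents with their cosine scores.
    print(get_url(doc_id), scores[doc_id])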
Example #2
def main():
    # Compare vocabulary sizes: both stemming and lemmatization
    # conflate surface forms, so the two normalized vocabularies
    # should come out smaller than the raw one.
    porter = file_handler.open_json_file("library/porter_vocab.txt")
    lemma = file_handler.open_json_file("library/lemma_vocab.txt")
    vocab = file_handler.open_json_file("library/vocab1.txt")

    print(len(porter))
    print(len(lemma))
    print(len(vocab))
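
Every example on this page depends on file_handler.open_json_file, which is not shown. Given that the library/*.txt files are indexed like parsed lists and dicts, a minimal sketch, assuming they hold plain JSON:

import json

def open_json_file(path):
    # Assumed implementation: parse a JSON-encoded library file
    # (vocab list, postings dict, doc titles, ...) from disk.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)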
Example #3
def cosineScore_title_weighing_score_return(query):
	global doclength
	global postings
	docTitles = file_handler.open_json_file("library/doctitles1.txt")
	scores = {}

	no_of_docs = len(doclength)

	for term in query:
		# Normalize the term the same way the index was built.
		term = term.lower()
		term = re.sub(r'[^\w*]', '', term)
		try:
			termid = vocab.index(term)
		except ValueError:
			print('"{}" is not in dictionary'.format(term))
			continue
		posting_list = postings[str(termid)]
		idf = calculate_idf(no_of_docs, len(posting_list))
		# Accumulate tf-idf per document, plus a title bonus.
		for doc_id in posting_list:
			tf = calculate_tf(posting_list[doc_id], doclength[doc_id])
			update_score(scores, doc_id, tf * idf)
			score_title(termid, doc_id, docTitles, scores)
	# Return document IDs sorted by descending score.
	answer = sorted(scores, key=lambda x: scores[x], reverse=True)

	return answer, scores
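
The helpers calculate_idf, calculate_tf, and update_score are not included in these examples; a minimal sketch consistent with the call sites above, with the exact weighting formulas being an assumption (score_title, which presumably adds a bonus when the term occurs in the document title, is omitted):

import math

def calculate_idf(no_of_docs, no_docs_with_term):
    # Assumed variant: plain log(N / df) inverse document frequency.
    return math.log(no_of_docs / no_docs_with_term)

def calculate_tf(term_count, doc_length):
    # Assumed variant: term frequency normalized by document length.
    return term_count / doc_length

def update_score(scores, doc_id, weight):
    # Accumulate one term's tf-idf contribution for one document.
    scores[doc_id] = scores.get(doc_id, 0.0) + weight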
Example #4
def lemmatize_retrieval(query):
	global lemma_vocab
	postings = file_handler.open_json_file("library/postings1.txt")
	final_list = []
	lemma_query = stemmer.word_lemmatizer(query)
	lemma_vocab = file_handler.open_json_file("library/lemma_vocab.txt")

	for word in lemma_query:
		word = word.lower()
		word = re.sub(r'[^\w*]', '', word)
		if word in lemma_vocab:
			# Each lemma maps to the original term IDs it covers;
			# collect every document containing any of those terms.
			# Distinct loop variables replace the original reuse of
			# "id" for both the term and the document loops.
			for term_id in lemma_vocab[word]:
				for doc_id in postings[str(term_id)]:
					final_list.append(doc_id)

	return final_list
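
The stemmer module used by Examples #1, #4, and #5 is not shown either; since Examples #6 and #7 use nltk.WordNetLemmatizer and PorterStemmer directly, word_lemmatizer and porter_word_stemmer are presumably thin per-token wrappers. A sketch under that assumption:

from nltk.stem import PorterStemmer, WordNetLemmatizer

def word_lemmatizer(words):
    # Assumed wrapper: lemmatize each query token with WordNet.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in words]

def porter_word_stemmer(words):
    # Assumed wrapper: apply the Porter stemmer to each token.
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in words]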
Example #5
def porter_stemming_retrieval(query):
	global stemmed_vocab
	postings = file_handler.open_json_file("library/postings1.txt")
	final_list = []
	stemmed_query = stemmer.porter_word_stemmer(query)
	stemmed_vocab = file_handler.open_json_file("library/porter_vocab.txt")

	for word in stemmed_query:
		word = re.sub(r'[^\w*]', '', word)
		word = word.lower()
		if word in stemmed_vocab:
			# Each stem maps to the original term IDs it covers;
			# collect every document containing any of those terms.
			for term_id in stemmed_vocab[word]:
				for doc_id in postings[str(term_id)]:
					final_list.append(doc_id)

	return final_list
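
Hypothetical side-by-side use of the two boolean-style retrievers; stems and lemmas conflate different sets of surface forms, so the result sets can differ:

query = ["running", "studies"]
stemmed_hits = set(porter_stemming_retrieval(query))
lemma_hits = set(lemmatize_retrieval(query))
# Sizes of each result set, plus the overlap between them.
print(len(stemmed_hits), len(lemma_hits), len(stemmed_hits & lemma_hits))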
Example #6
def vocab_lemmatize(split_text):
    lemmatized_vocab = {}

    lemmatizer = nltk.WordNetLemmatizer()

    vocab = file_handler.open_json_file("library/vocab1.txt")

    for word in split_text:
        # vocab.index is a linear scan: O(len(vocab)) per word.
        oldId = vocab.index(word)
        lemmatized_word = lemmatizer.lemmatize(word)

        # Group every original term ID under its shared lemma.
        if lemmatized_word in lemmatized_vocab:
            lemmatized_vocab[lemmatized_word].append(oldId)
        else:
            lemmatized_vocab[lemmatized_word] = [oldId]

    return lemmatized_vocab
Example #7
def vocab_stemmer_porter(split_text):
    porter_stemmed_vocab = {}

    porter_stemmer = PorterStemmer()

    vocab = file_handler.open_json_file("library/vocab1.txt")

    for word in split_text:
        # vocab.index is a linear scan: O(len(vocab)) per word.
        oldId = vocab.index(word)
        porter_stemmed_word = porter_stemmer.stem(word)

        # Group every original term ID under its shared stem.
        if porter_stemmed_word in porter_stemmed_vocab:
            porter_stemmed_vocab[porter_stemmed_word].append(oldId)
        else:
            porter_stemmed_vocab[porter_stemmed_word] = [oldId]

    return porter_stemmed_vocab
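
A hypothetical build step that produces the normalized vocab files the retrievers above load. file_handler.save_json_file is an assumption (any json.dump wrapper would do); passing the vocabulary list to both builders works because vocab.index then recovers each word's own term ID:

def build_normalized_vocabs():
    vocab = file_handler.open_json_file("library/vocab1.txt")
    porter = vocab_stemmer_porter(vocab)
    lemma = vocab_lemmatize(vocab)
    # save_json_file is hypothetical, not part of these examples.
    file_handler.save_json_file("library/porter_vocab.txt", porter)
    file_handler.save_json_file("library/lemma_vocab.txt", lemma)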
Example #8
def get_url(doc_id):
	# Map a document ID back to its URL; IDs arrive as strings from
	# the JSON postings, so cast to int before indexing the list.
	# The parameter is renamed from "id" to avoid shadowing the builtin.
	docids = file_handler.open_json_file("library/docids1.txt")
	return docids[int(doc_id)]