def retrive_documents(self,query_id):
    '''
    Expand the query identified by query_id; document scoring is present
    below but currently unreachable.

    As written, the method builds a Query from query_id, hands it to
    self._expand_query() and then executes a bare return, so it always
    returns None and none of the code after that point runs.

    The unreachable part loads the collection and, for every document,
    sums p1 * p2 * p3 over the query terms. That product has the form of
    the Okapi BM25 weight: p1 is the document-side term-frequency
    saturation (k1, b, document length), p2 the query-side saturation (k3)
    and p3 a base-2 log inverse-document-frequency weight. The totals are
    collected in a local dict keyed by doc.PMID; nothing visible here
    returns or persists that dict.

    query_id -- passed straight to Query(); its type is not visible here.
    '''
    # Weighting constants used by the scoring formula below.
    k1 = 1.2   # scales the document-side term-frequency part (p1, via K)
    k3 = 8.00  # scales the query-side term-frequency part (p2)
    # Hard-coded average document length.
    # NOTE(review): presumably measured on this collection -- confirm.
    avg_dl = 122
    # With b = 1 the factor (1-b)+b*(dl/avg_dl) reduces to dl/avg_dl.
    b = 1 # from 0.25 to 2.00 increase 0.25
    q = Query(query_id)
    #q.set_concepts(self.QueryConceptExtraction(q.text))
    self._expand_query(q)
    # NOTE(review): this bare return ends the method here; everything below
    # is dead code. Confirm whether it is a leftover debugging short-circuit.
    return
    print "Retrieving Documents for: ", q.text
    Collection._load()
    Collection._load_go()
    Collection._load_tags()
    Collection._load_indexes() #Loads documents into _documents with PMID and Index
    score = dict()  # doc.PMID -> accumulated score
    N = Collection._count  # presumably the number of documents -- confirm
    # Collection-level frequency of each query term, looked up once per term.
    # q.text is iterated term by term here and joined with spaces further
    # down, so it is presumably a sequence of term strings -- confirm.
    Nt = dict()
    for term in q.text:
        Nt[term] = Collection._get_frequency(term)
    # counter is incremented once per document but never read in the
    # visible code.
    counter = 0
    for doc in Collection._documents:
        summation = 0;
        # Multiplying by 1.00 forces a float, so dl/avg_dl is not truncated
        # by Python 2 integer division if doc.length is an int.
        dl = doc.length * 1.00
        for t in q.text:
            tfn = doc.get_frequency(t)  # frequency of t in this document
            # NOTE(review): QQ, qtf and w use only the query and collection
            # counts, and K uses only the document length; they are
            # nevertheless recomputed for every (document, term) pair.
            QQ = ' '.join(q.text)
            qtf = Document._term_frequency(QQ, t)  # frequency of t in the query text
            K = k1*((1-b)+b*(dl/avg_dl))
            # NOTE(review): log is not defined in this block; presumably
            # math.log, in which case the second argument 2 is the base.
            w = log((N-Nt[t]+0.5)/(Nt[t]+0.5),2)
            # w is negative when the ratio is below 1, i.e. Nt[t] > N/2.
            if w<0:
                #this makes the result a negative number
                # if we break the result will be bigger than or equal to zero
                # NOTE(review): break leaves the term loop, so every later
                # term in q.text is skipped for this document as well, not
                # only the negative-weight one -- confirm this is intended.
                break
            p1 = (((k1+1)*tfn)/(K+tfn))
            p2 = ((k3+1)*qtf/(k3+qtf))
            p3 = w
            summation += p1*p2*p3
        score[doc.PMID] = summation
        counter += 1
def Indexing(self):
    '''
    Rebuild the index terms of every document in the collection.

    Steps, in order:
    - delete all rows from the collection_index table
    - load the documents and their tags
    - per document: take the abstract terms (plus the tag terms when
      GlobalVariables.global_context_activated is set), remove stop
      words, stem what is left and pass the result to doc.set_index()
    '''
    DB._execute("DELETE from collection_index")
    # A single-argument print(...) prints the same text as the print
    # statement under Python 2.
    print("Indexing is started...")
    processor = TextProcessor()
    Collection._load()
    # Per the original note: documents are loaded with PMID, tags and abstracts.
    Collection._load_tags()
    for document in Collection._documents:
        # Abstract terms come first, tag terms (if enabled) after them.
        terms = list(document.abstract)
        if GlobalVariables.global_context_activated:
            terms.extend(document.tag)
        # Stop-word removal runs before stemming.
        document.set_index(processor.Stem(processor.EliminateStopWords(terms)))
    print("Indexing is Done!")