Пример #1
0
 def retrive_documents(self,query_id):
     """Expand the query identified by ``query_id`` and return ``None``.

     Builds a ``Query`` from ``query_id`` and hands it to
     ``self._expand_query``.  The bare ``return`` right after that call ends
     the method, so none of the scoring code below it is ever executed.

     NOTE(review): the unreachable remainder looks like a BM25-style ranking
     over ``Collection._documents`` -- confirm whether the early ``return``
     is a debugging leftover.  Within the lines visible here the ``score``
     dict is filled but never returned or stored.
     """
     # Scoring constants used only by the unreachable code below.
     # NOTE(review): presumably BM25 tuning parameters (k1, k3, b) and a
     # hard-coded average document length (avg_dl) -- confirm.
     k1 = 1.2
     k3 = 8.00
     avg_dl = 122
     b = 1 # from 0.25 to 2.00 increase 0.25
     q = Query(query_id)
     # Disabled call, kept from the original:
     #q.set_concepts(self.QueryConceptExtraction(q.text))
     self._expand_query(q)
     # Early exit: every statement after this line is dead code.
     return
     print "Retrieving Documents for: ", q.text
     Collection._load()
     Collection._load_go()
     Collection._load_tags()
     Collection._load_indexes()      #Loads documents into _documents with PMID and Index
     # score maps doc.PMID -> accumulated score for that document.
     score = dict()
     N = Collection._count
     # Nt maps each element of q.text to Collection._get_frequency(element).
     Nt = dict()
     for term in q.text:
         Nt[term] = Collection._get_frequency(term)
     # Incremented once per document; not read anywhere in the visible code.
     counter = 0
     for doc in Collection._documents:
         summation = 0;
         # "* 1.00" presumably forces a float so that dl/avg_dl below is not
         # an integer division under Python 2 (avg_dl is an int).
         dl = doc.length * 1.00
         for t in q.text:
             tfn = doc.get_frequency(t)
             # The query elements joined by spaces; rebuilt on every
             # iteration although it depends on neither t nor doc.
             QQ = ' '.join(q.text)
             # presumably the frequency of t inside the joined query text --
             # verify against Document._term_frequency.
             qtf = Document._term_frequency(QQ, t)
             # Length normalisation: k1 scaled by dl relative to avg_dl,
             # blended by b.
             K = k1*((1-b)+b*(dl/avg_dl))
             # Base-2 logarithm of the frequency ratio (assuming `log` is
             # math.log, whose second argument is the base).
             w = log((N-Nt[t]+0.5)/(Nt[t]+0.5),2)
             if w<0:
                 # A negative w would make the contribution negative.
                 # Breaking here keeps the sum accumulated so far, but it
                 # also skips all remaining elements of q.text for this doc.
                 break
             p1 = (((k1+1)*tfn)/(K+tfn))
             p2 = ((k3+1)*qtf/(k3+qtf))
             p3 = w
             # Contribution of t is the product of the three factors.
             summation += p1*p2*p3
         score[doc.PMID] = summation
         counter += 1
Пример #2
0
 def DocumentExpantion(self):
     '''
     Weight the GO concepts of every document and store them in the database.

     Empties the ``collection_concepts`` table, loads the collection and its
     GO annotations, and then, for each document ``d``:

     * for each concept in ``d.go``, collects the distinct words of all
       concept variants found in the ontology dictionary and averages a
       BM25-style weight over those words;
     * keeps only the concepts whose average weight exceeds
       ``wieght_threshold``;
     * tags the document with the variants of the kept concepts
       (``d.set_tag``) and inserts one ``(PMID, Concepts)`` row, where
       ``Concepts`` is a comma-separated list of ``concept;weight`` pairs.

     Weight of a single word::

         tf * log10((N - n_k + 0.5) / (n_k + 0.5)) / (K + tf)
         K  = k1 * ((1 - b) + b * doc_len / doc_avg_len)

     Words whose weight cannot be computed (logarithm domain error or
     division by zero) contribute 0.  Concepts that yield no words at all
     are skipped.

     Returns None; progress messages are printed to stdout.
     '''
     # Single-argument print(...) behaves the same on Python 2 and Python 3.
     print("Calculating weights is started...")
     wieght_threshold = 0.10
     tp = TextProcessor()
     ontology = Ontology()
     db = DB()
     db.Query("delete from collection_concepts;")
     Collection._load()
     Collection._load_go()
     N = Collection._count
     #Terminologies are ('go','mesh','icd10','snomed') corresponding with columns 2,3,4,5
     T = ontology.GetDict('go')    #bring all ontologies into the memory to be faster!
     # Tuning parameters.
     doc_avg_len = 122.0
     k1 = 1.2
     b = 1.00
     print(Collection._count)
     for d in Collection._documents:
         # Length normalisation is the same for every word of the document.
         # float() keeps the ratio fractional even with Python 2 integers.
         length_norm = k1 * ((1 - b) + b * (float(d.length) / doc_avg_len))
         weight = dict()
         for C in d.go:
             C = C.replace(' ', '')
             # All variants of the concept, each padded with spaces, in one string.
             variants_text = ' ' + ''.join(' {0} '.format(variant) for variant in T[C])
             terms = set(variants_text.split(tp.WordSplitter()))
             # Drop the empty string directly on the set, so the result does
             # not depend on whether a list helper mutates in place.
             terms.discard('')
             if not terms:
                 # Nothing to average over; avoids dividing by zero below.
                 continue
             sumation = 0.0
             for term in terms:
                 tf = d.get_frequency(term)
                 n_k = Collection._get_frequency(term)
                 try:
                     idf = log10((N - n_k + 0.50) / (n_k + 0.50))
                     term_weight = tf * idf / (length_norm + tf)
                 except (ValueError, ArithmeticError):
                     # log10 of a non-positive ratio, or a zero denominator
                     # (empty document and tf == 0): the word adds nothing.
                     term_weight = 0
                 sumation += term_weight
             mean_weight = sumation / len(terms)
             if mean_weight > wieght_threshold:
                 weight[C] = mean_weight
         # Store concepts and weights in the database: "concept;weight" pairs
         # separated by commas.
         ConceptList = []
         pairs = []
         for concept in weight:
             ConceptList.extend(T[concept])
             pairs.append(str(concept) + ';' + str(weight[concept]))
         values = ','.join(pairs)
         d.set_tag(ConceptList)   #Adding tag tags to documents
         # NOTE(review): the statement is assembled by string formatting; the
         # DB wrapper's support for bound parameters is not visible here.
         # Switch to a parameterised query if it is available.
         query = 'Insert into collection_concepts (PMID, Concepts) values({0}, "{1}")'.format(d.PMID,values)
         db.Query(query)
     print("Calculating weights is Done! Concepts are added to Database")