示例#1
0
 def retrive_documents(self,query_id):
     '''
     Build a Query from ``query_id`` and run ``self._expand_query`` on it.

     The result of ``_expand_query`` is discarded and the method returns
     None right after that call.

     NOTE(review): everything below the bare ``return`` is unreachable.
     It loads the collection and runs a BM25-style scoring loop that fills
     ``score`` (keyed by document PMID), but nothing in this method reads
     or returns ``score`` or ``counter``. Confirm whether the early return
     is a leftover debugging shortcut before relying on this method for
     retrieval.
     '''
     # Constants of the BM25-style formula used in the (unreachable) scoring loop below.
     k1 = 1.2
     k3 = 8.00
     avg_dl = 122  # hard-coded average document length
     b = 1 # original note: try values from 0.25 to 2.00 in steps of 0.25
     q = Query(query_id)
     #q.set_concepts(self.QueryConceptExtraction(q.text))
     self._expand_query(q)
     # Unconditional return: none of the statements after this line ever run.
     return
     print "Retrieving Documents for: ", q.text
     Collection._load()
     Collection._load_go()
     Collection._load_tags()
     Collection._load_indexes()      # Loads documents into _documents with PMID and Index
     score = dict()
     N = Collection._count
     # Collection frequency of every query term, looked up once before the document loop.
     Nt = dict()
     for term in q.text:
         Nt[term] = Collection._get_frequency(term)
     counter = 0
     for doc in Collection._documents:
         summation = 0;
         dl = doc.length * 1.00  # "* 1.00" forces float arithmetic in dl/avg_dl
         for t in q.text:
             tfn = doc.get_frequency(t)
             # Frequency of t inside the query text itself (joined with spaces).
             QQ = ' '.join(q.text)
             qtf = Document._term_frequency(QQ, t)
             # Document-length normalisation factor.
             K = k1*((1-b)+b*(dl/avg_dl))
             # Base-2 log of (N - Nt + 0.5) / (Nt + 0.5).
             w = log((N-Nt[t]+0.5)/(Nt[t]+0.5),2)
             if w<0:
                 # A negative w would make this term's contribution negative.
                 # Breaking here stops scoring ALL remaining query terms for
                 # this document, so the sum stays at whatever was accumulated.
                 break
             p1 = (((k1+1)*tfn)/(K+tfn))   # document term-frequency component
             p2 = ((k3+1)*qtf/(k3+qtf))    # query term-frequency component
             p3 = w
             summation += p1*p2*p3
         score[doc.PMID] = summation
         counter += 1
示例#2
0
 def DocumentExpantion(self):
     '''
     Weight the GO concepts of every document and store them in the database.

     WARNING: destructive. The ``collection_concepts`` table is emptied
     (``delete from collection_concepts;``) before anything is recomputed.

     For each document in ``Collection._documents`` and each concept in
     ``d.go``, the concept's variants (looked up in the 'go' ontology
     dictionary) are split into a set of terms. Every term gets a
     BM25-style weight computed from its frequency in the document, its
     collection frequency and the document length; the concept weight is
     the mean of its term weights. Concepts whose mean weight exceeds 0.10
     are kept: their variants become the document's tags and the
     ``concept;weight`` pairs (comma separated) are inserted into
     ``collection_concepts`` together with the document's PMID.

     Returns None.
     '''
     print("Calculating weights is started...")
     weight_threshold = 0.10
     tp = TextProcessor()
     ontology = Ontology()
     db = DB()
     db.Query("delete from collection_concepts;")
     Collection._load()
     Collection._load_go()
     N = Collection._count
     # Terminologies are ('go','mesh','icd10','snomed') corresponding with columns 2,3,4,5.
     # The whole 'go' dictionary is fetched once so lookups inside the loops stay in memory.
     T = ontology.GetDict('go')
     # Tuning parameters.
     doc_avg_len = 122
     k1 = 1.2
     b = 1.00
     print(Collection._count)
     for d in Collection._documents:
         # "* 1.00" forces float division; with two ints Python 2 would
         # truncate the length ratio to a whole number.
         length_ratio = (d.length * 1.00) / doc_avg_len
         weight = dict()
         for C in d.go:
             C = C.replace(' ','')
             # Collect all variants of concept C into one space-padded string.
             var = ' ' + ''.join(' {0} '.format(variant) for variant in T[C])
             terms = set(var.split(tp.WordSplitter()))
             # presumably strips empty strings from `terms` in place -- confirm
             # that the helper accepts a set
             tp.remove_values_from_list(terms,'')
             term_count = len(terms)
             if term_count == 0:
                 # No usable term: the mean below would divide by zero.
                 continue
             total = 0.0
             for term in terms:
                 tf = d.get_frequency(term)
                 n_k = Collection._get_frequency(term)
                 try:
                     idf = log10((N-n_k+0.50)/(n_k+0.50))
                     # NOTE(review): standard BM25 uses
                     # k1*((1-b)+b*dl/avgdl) + tf as denominator; the
                     # expression below is kept as the file had it.
                     term_weight = tf * (idf/(k1+((1-b)+b) * length_ratio+(tf)))
                 except (ValueError, ArithmeticError):
                     # log10 of a non-positive ratio or a zero denominator:
                     # the term contributes nothing.
                     term_weight = 0
                 total += term_weight
             mean_weight = (1.00/term_count) * total
             if mean_weight > weight_threshold:
                 weight[C] = mean_weight
         # Store concepts and weights in the database: "concept;weight" pairs
         # separated by commas.
         ConceptList = []
         pairs = []
         for concept in weight:
             ConceptList.extend(T[concept])
             pairs.append(str(concept) + ';' + str(weight[concept]))
         values = ','.join(pairs)
         d.set_tag(ConceptList)   # attach the variants of the kept concepts as tags
         # NOTE(review): the statement is built by string formatting; switch to
         # bound parameters if DB.Query supports them.
         query = 'Insert into collection_concepts (PMID, Concepts) values({0}, "{1}")'.format(d.PMID,values)
         db.Query(query)
     print("Calculating weights is Done! Concepts are added to Database")
示例#3
0
 def _expand_query(self,q):
     '''
     Expand query ``q`` with terms drawn from its local (top-document) context.

     Step 1 scores every document in ``Collection._documents`` against the
     terms of ``q.text`` with a BM25-style formula (k1=1.2, k3=8, b=1,
     average document length 122), plots the positive scores through
     ``Display._plot`` and ranks the ``M`` best documents, where ``M`` comes
     from ``Parameter.GetDocNumberForLocalContext()``.

     Step 2 gives every candidate term the weight
     ``tfq/MaxTFQ + Beta * InfoBO1/MaxInfo`` (Beta = 0.4) and keeps the
     terms whose weight exceeds 0.25.

     NOTE(review): the candidate text is tokenized from an empty string, so
     the ranked top documents are never fed into step 2. The code that
     gathers their text appears to be missing -- confirm against the
     original project.

     Returns the list of selected expansion terms.
     '''
     #--STEP 1----------Extract TOP DOCUMENTS ----------------------------
     tp = TextProcessor()
     param = Parameter()
     k1      = 1.2
     k3      = 8.00
     avg_dl  = 122
     b       = 1                     # from 0.25 to 2.00 increase 0.25
     Collection._load_indexes()      # Loads indexes into _documents
     N = len(Collection._documents)
     # Query frequency and idf depend only on the term, so they are looked up
     # once per term (lazily, on first use) instead of once per document.
     term_stats = dict()
     score = dict()
     for D in Collection._documents:
         summation = 0
         dl = D.length * 1.00
         length_norm = k1*((1-b)+b*(dl/avg_dl))
         for t in q.text:
             if t not in term_stats:
                 Nt = Collection._get_frequency(t)
                 term_stats[t] = (q.get_frequency(t), log((N-Nt+0.5)/(Nt+0.5),2))
             qtf, w = term_stats[t]
             if w<0:
                 # A negative idf would push the score below zero. Breaking
                 # stops scoring the remaining query terms for this document
                 # and keeps the sum non-negative.
                 break
             tfn = D.get_frequency(t)
             p1 = (((k1+1)*tfn)/(length_norm+tfn))
             p2 = ((k3+1)*qtf/(k3+qtf))
             summation += p1*p2*w
         score[D.PMID] = summation
     M = param.GetDocNumberForLocalContext()
     # Only strictly positive scores are plotted and ranked.
     positive = [(pmid, s) for pmid, s in score.items() if s > 0]
     new_score = dict(positive)
     # Stable descending sort: documents with equal scores keep their
     # iteration order, as the former insertion loop did. The lists are
     # padded with ''/0 up to M entries.
     best = sorted(positive, key=lambda pair: pair[1], reverse=True)[:max(M, 0)]
     padding = M - len(best)
     TopDocs = [pmid for pmid, _ in best] + [''] * padding
     TopNums = [s for _, s in best] + [0] * padding
     Display._plot(new_score, q)
     # NOTE(review): TopDocs/TopNums are not used below; the candidate text
     # starts from an empty string.
     TopDocsTexts = ''
     TopDocsTexts = tp.Tokenize(TopDocsTexts)
     TopDocsTexts = TextProcessor._remove_stop_words(TopDocsTexts)
     #---STEP 2---------Calculate weight of each term which is a member of new query----------------------------
     Beta = 0.4
     MaxTFQ = 0.001                  # floor that keeps the normalisation below from dividing by zero
     MaxInfo = 0
     context_text = ' '.join(TopDocsTexts)
     term_info = []
     for term in TopDocsTexts:
         tfq = q.get_frequency(term)
         if tfq > MaxTFQ:
             MaxTFQ = tfq
         # NOTE(review): the same in-context frequency serves as both Lambda
         # and the term frequency, exactly as the code did before -- confirm
         # that this is intended.
         Lambda = Document._term_frequency(context_text, term)
         log1 = log(1.00/(1.00+Lambda),2)
         log2 = log(Lambda/(1.00+Lambda),2)
         InfoBO1 = -log1 - Lambda * log2
         if InfoBO1 > MaxInfo:
             MaxInfo = InfoBO1
         term_info.append((term, tfq, InfoBO1))
     weight = dict()
     for term, tfq, InfoBO1 in term_info:
         if MaxInfo > 0:
             weight[term] = (tfq + 0.00)/MaxTFQ + Beta*(InfoBO1/MaxInfo)
         else:
             weight[term] = 0
     QPrime = [term for term in weight if weight[term] > 0.25]
     return  QPrime