Example #1
0
 def extract_concepts(self):
     '''
     Concepts are extracted from each document.
     for the extracted concepts, Predecessors are extracted.
     We need to keep 'Already Extracted' List of 
     Go concepts so that we avoid extracting them again
     
     OR
     
     we use a try except when we are inserting the concept to avoid duplicate!
     we well pay the cost of duplicate extraction though!
     '''
     db = DB()
     Collection._load()
     terminology_list = ["go"]#,"mesh","icd10","snomed"]
     extracted_doc = 0
     for terminology in terminology_list:
         MaxMatcher = dict()
         for doc in Collection._documents:  
             extracted_doc += 1
             print "extracted_doc: ",extracted_doc , ' id:', doc.PMID
             document = doc.abstract                           # document is the abstract text.
             concepts = self._extract_concepts(document,terminology,MaxMatcher)
             if len(concepts)>0:
                 concept_id_list = ','.join(concepts)
                 if terminology == 'go':
                     self.AddGeneOntologyConceptPredecessors(doc.PMID,concepts)
                 query = "Insert into collection_go(PMID,go_id_list) values ('"+doc.PMID+"',' "+concept_id_list+"');"
                 try:
                     print query
                     db.Query(query)
                 except:
                     print ""#"Unexpected error:", sys.exc_info()[0]
Example #2
0
 def retrive_documents(self,query_id):
     '''
     Score the collection's documents against query *query_id* using the
     Okapi BM25 ranking function.

     NOTE(review): the bare ``return`` right after ``_expand_query`` makes
     the entire BM25 scoring loop below unreachable -- it looks like a
     debugging short-circuit that was left in; confirm whether it should
     be removed before relying on this method for retrieval.
     '''
     # BM25 tuning parameters
     k1 = 1.2
     k3 = 8.00
     avg_dl = 122                   # presumably the average document length -- TODO confirm
     b = 1 # from 0.25 to 2.00 increase 0.25
     q = Query(query_id)
     #q.set_concepts(self.QueryConceptExtraction(q.text))
     self._expand_query(q)
     return
     # ---- everything below is currently dead code (see NOTE above) ----
     print "Retrieving Documents for: ", q.text
     Collection._load()
     Collection._load_go()
     Collection._load_tags()
     Collection._load_indexes()      #Loads documents into _documents with PMID and Index
     score = dict()
     N = Collection._count           # total number of documents (for the idf component)
     Nt = dict()
     # Nt[term] = number of documents containing the term (document frequency)
     for term in q.text:
         Nt[term] = Collection._get_frequency(term)
     counter = 0
     for doc in Collection._documents:
         summation = 0;
         dl = doc.length * 1.00      # force float so length normalization is not truncated
         for t in q.text:
             tfn = doc.get_frequency(t)                 # term frequency in the document
             QQ = ' '.join(q.text)
             qtf = Document._term_frequency(QQ, t)      # term frequency in the query itself
             K = k1*((1-b)+b*(dl/avg_dl))               # BM25 length normalization
             w = log((N-Nt[t]+0.5)/(Nt[t]+0.5),2)       # Robertson-Sparck Jones idf weight
             if w<0:
                 #this makes the result a negative number
                 # if we break the result will be bigger than or equal to zero
                 break
             p1 = (((k1+1)*tfn)/(K+tfn))
             p2 = ((k3+1)*qtf/(k3+qtf))
             p3 = w
             summation += p1*p2*p3
         score[doc.PMID] = summation
         counter += 1
Example #3
0
 def Indexing(self):
     '''
     IR Indexing Operations
         - Elimination of Stopwords
         - 
     '''
     DB._execute("DELETE from collection_index")
     print "Indexing is started..."
     tp = TextProcessor() 
     Collection._load()
     Collection._load_tags() #loading document with PMID, tags and abstracts
     for doc in Collection._documents:
         index_list = []
         for term in doc.abstract:
             index_list.append(term)
         if GlobalVariables.global_context_activated:
             for term in doc.tag:
                 index_list.append(term)
         index_list = tp.EliminateStopWords(index_list)
         index_list = tp.Stem(index_list)
         doc.set_index(index_list)
     print "Indexing is Done!"       
Example #4
0
 def DocumentExpantion(self):
     '''
     db.Query("delete from collection_concepts;")!!!
     
     BM25TermWeightingModel
     BM25 or Best Match algorithm, calculates the weight of 
     each word in each extracted concept for the document 
     '''
     print "Calculating weights is started..."
     wieght_threshold = 0.10
     tp = TextProcessor()
     ontology = Ontology()
     db = DB()
     db.Query("delete from collection_concepts;")
     Collection._load()
     Collection._load_go()
     N = Collection._count
     #Terminologies are ('go','mesh','icd10','snomed') corresponding with columns 2,3,4,5
     T = ontology.GetDict('go')    #bring all ontologies into the memory to be faster!
     doc_avg_len = 122
     k1 = 1.2 
     b = 1.00
     doc_counter = 0   
     print Collection._count                     # tuning parameters!
     for d in Collection._documents:
         doc_counter += 1
         doc_len = d.length
         weight = dict()
         for C in d.go:
             C = C.replace(' ','')
             # Extract concept variants for C
             var = ' '
             for variant in T[C]:
                 var += ' {0} '.format(variant)
             terms = set( var.split(tp.WordSplitter()))
             tp.remove_values_from_list(terms,'')
             l = len(terms)    
             sumation = 0  
             for term in terms:
                 term_weight = 0
                 #calculate the weight
                 tf = d.get_frequency(term)                     
                 #Here goes calculating the weight
                 n_k = Collection._get_frequency(term)
                 tf = d.get_frequency(term)
                 try:
                     term_weight = tf * (( log10((N-n_k+0.50)/(n_k+0.50)) )/(k1+((1-b)+b) * (doc_len/doc_avg_len)+(tf))) 
                 except:
                     pass
                     #print "One here!++++++++++++++++++++++++++++++++++"
                 sumation += term_weight
             if (sumation/l) > wieght_threshold:
                 weight[C] = (1.00/l) * sumation
         # Store concepts and weights in the database, concepts and their weights are semi-colon separated
         values = ''
         ConceptList = []
         for row in weight:
             row = row.replace(" ",'')
             for term in T[row]:
                 ConceptList.append(term)
             if values == '':
                 values = str(row) + ';' + str(weight[row])
             else:
                 values += ',' + str(row) + ';' + str(weight[row])
         d.set_tag(ConceptList)   #Adding tag tags to documents
         query = 'Insert into collection_concepts (PMID, Concepts) values({0}, "{1}")'.format(d.PMID,values)
         #print query
         db.Query(query)
     print "Calculating weights is Done! Concepts are added to Database"