def extract_concepts(self):
    '''
    Concepts are extracted from each document; for the extracted concepts,
    predecessors are extracted as well.
    We either keep an "already extracted" list of GO concepts so that we avoid
    extracting them again, OR we use a try/except when inserting the concept to
    skip duplicates -- in that case we pay the cost of the duplicate extraction.
    '''
    db = DB()
    Collection._load()
    terminology_list = ["go"]  # ,"mesh","icd10","snomed"
    extracted_doc = 0
    for terminology in terminology_list:
        MaxMatcher = dict()
        for doc in Collection._documents:
            extracted_doc += 1
            print "extracted_doc: ", extracted_doc, ' id:', doc.PMID
            document = doc.abstract  # document is the abstract text
            concepts = self._extract_concepts(document, terminology, MaxMatcher)
            if len(concepts) > 0:
                concept_id_list = ','.join(concepts)
                if terminology == 'go':
                    self.AddGeneOntologyConceptPredecessors(doc.PMID, concepts)
                    query = ("Insert into collection_go(PMID, go_id_list) "
                             "values ('" + doc.PMID + "', '" + concept_id_list + "');")
                    try:
                        print query
                        db.Query(query)
                    except:
                        # duplicate insert (concept already stored) -- ignore
                        pass  # print "Unexpected error:", sys.exc_info()[0]
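# Sketch (illustration only, not part of the original pipeline): the docstring above
# weighs keeping an "already extracted" set against relying on try/except around the
# insert. The set-based variant looks roughly like this; extract_one() and
# insert_concept() are hypothetical stand-ins for self._extract_concepts() and the
# SQL insert.
def _extract_without_duplicates(documents, extract_one, insert_concept):
    seen = set()  # GO ids already stored in collection_go
    for doc in documents:
        for concept in extract_one(doc):
            if concept in seen:
                continue  # skip before paying for another extraction / DB round-trip
            seen.add(concept)
            insert_concept(doc.PMID, concept)
    return seen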
def retrive_documents(self, query_id):
    k1 = 1.2
    k3 = 8.00
    avg_dl = 122
    b = 1  # tuning range: from 0.25 to 2.00, in steps of 0.25
    q = Query(query_id)
    # q.set_concepts(self.QueryConceptExtraction(q.text))
    self._expand_query(q)
    return  # NOTE: returns here, so the BM25 retrieval below is currently disabled
    print "Retrieving Documents for: ", q.text
    Collection._load()
    Collection._load_go()
    Collection._load_tags()
    Collection._load_indexes()  # loads documents into _documents with PMID and index
    score = dict()
    N = Collection._count
    Nt = dict()
    for term in q.text:
        Nt[term] = Collection._get_frequency(term)
    counter = 0
    for doc in Collection._documents:
        summation = 0
        dl = doc.length * 1.00
        for t in q.text:
            tfn = doc.get_frequency(t)
            QQ = ' '.join(q.text)
            qtf = Document._term_frequency(QQ, t)
            K = k1 * ((1 - b) + b * (dl / avg_dl))
            w = log((N - Nt[t] + 0.5) / (Nt[t] + 0.5), 2)
            if w < 0:
                # a negative idf would push the score below zero;
                # breaking here keeps the document score >= 0
                break
            p1 = ((k1 + 1) * tfn) / (K + tfn)
            p2 = ((k3 + 1) * qtf) / (k3 + qtf)
            p3 = w
            summation += p1 * p2 * p3
        score[doc.PMID] = summation
        counter += 1
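# Sketch (assumption, not in the original method): retrive_documents() fills the
# score dict but never ranks or returns it. A typical final step would sort the
# PMIDs by descending BM25 score and keep the top k, roughly like this:
def _rank_scores(score, top_k=10):
    # score: dict mapping PMID -> BM25 score, as built in the loop above
    ranked = sorted(score.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]  # list of (PMID, score) pairs, best first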
def Indexing(self):
    '''
    IR indexing operations:
    - elimination of stopwords
    - stemming
    '''
    DB._execute("DELETE from collection_index")
    print "Indexing is started..."
    tp = TextProcessor()
    Collection._load()
    Collection._load_tags()  # loads documents with PMID, tags and abstracts
    for doc in Collection._documents:
        index_list = []
        for term in doc.abstract:
            index_list.append(term)
        if GlobalVariables.global_context_activated:
            for term in doc.tag:
                index_list.append(term)
        index_list = tp.EliminateStopWords(index_list)
        index_list = tp.Stem(index_list)
        doc.set_index(index_list)
    print "Indexing is Done!"
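# Sketch (assumption): TextProcessor's implementation is not shown in this file; its
# EliminateStopWords() and Stem() steps are assumed to behave roughly like the
# NLTK-based version below (requires nltk and a prior nltk.download('stopwords')).
def _stop_and_stem(terms):
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # drop stopwords first, then reduce the remaining terms to their stems
    return [stemmer.stem(t) for t in terms if t.lower() not in stops]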
def DocumentExpantion(self):
    '''
    Document expansion with the BM25 term-weighting model (BM25TermWeightingModel).
    BM25, or Best Match, calculates the weight of each word of each extracted
    concept for the document. Note that this clears collection_concepts first
    (db.Query("delete from collection_concepts;")).
    '''
    print "Calculating weights is started..."
    weight_threshold = 0.10
    tp = TextProcessor()
    ontology = Ontology()
    db = DB()
    db.Query("delete from collection_concepts;")
    Collection._load()
    Collection._load_go()
    N = Collection._count
    # Terminologies are ('go','mesh','icd10','snomed'), corresponding to columns 2,3,4,5.
    T = ontology.GetDict('go')  # bring the whole GO ontology into memory to be faster
    # tuning parameters
    doc_avg_len = 122
    k1 = 1.2
    b = 1.00
    doc_counter = 0
    print Collection._count
    for d in Collection._documents:
        doc_counter += 1
        doc_len = d.length
        weight = dict()
        for C in d.go:
            C = C.replace(' ', '')
            # extract the concept variants for C
            var = ' '
            for variant in T[C]:
                var += ' {0} '.format(variant)
            terms = set(var.split(tp.WordSplitter()))
            tp.remove_values_from_list(terms, '')
            l = len(terms)
            summation = 0
            for term in terms:
                term_weight = 0
                # calculate the BM25-style weight of this term in the document
                tf = d.get_frequency(term)
                n_k = Collection._get_frequency(term)
                try:
                    term_weight = tf * ((log10((N - n_k + 0.50) / (n_k + 0.50))) /
                                        (k1 + ((1 - b) + b) * (doc_len / doc_avg_len) + tf))
                except:
                    pass
                summation += term_weight
            if (summation / l) > weight_threshold:
                weight[C] = (1.00 / l) * summation
        # Store the concepts and their weights in the database; each entry is a
        # semicolon-separated "concept;weight" pair, pairs are comma-separated.
        values = ''
        ConceptList = []
        for row in weight:
            row = row.replace(" ", '')
            for term in T[row]:
                ConceptList.append(term)
            if values == '':
                values = str(row) + ';' + str(weight[row])
            else:
                values += ',' + str(row) + ';' + str(weight[row])
        d.set_tag(ConceptList)  # add the concept variants as tags to the document
        query = 'Insert into collection_concepts (PMID, Concepts) values({0}, "{1}")'.format(d.PMID, values)
        # print query
        db.Query(query)
    print "Calculating weights is Done! Concepts are added to Database"
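# Sketch (illustration): the per-term weight computed inside DocumentExpantion, pulled
# out as a standalone function so the formula is easier to read. Parameter names
# follow the locals above (tf, n_k, N, doc_len, doc_avg_len, k1, b); the only change
# is an explicit float cast for the length ratio.
def _concept_term_weight(tf, n_k, N, doc_len, doc_avg_len=122, k1=1.2, b=1.00):
    from math import log10
    # idf-like component; raises ValueError when n_k > N, which the caller above
    # absorbs with its try/except
    idf = log10((N - n_k + 0.50) / (n_k + 0.50))
    # length-normalised denominator, exactly as written in the loop above
    denom = k1 + ((1 - b) + b) * (float(doc_len) / doc_avg_len) + tf
    return tf * (idf / denom)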