def getScores(self, query):
    """Score documents against a query with a unigram language model.

    Only documents containing at least one query stem are scored. A
    document's score is the product, over the query stems that occur in
    it, of ``tf(t, d) / |d| + tf(t, C) / |C|`` where ``|d|`` is the sum of
    the document's term frequencies and ``|C|`` the sum of all term
    frequencies in the collection.

    NOTE(review): the two probabilities are added without a smoothing
    weight and query stems absent from a document are skipped rather than
    smoothed; this mirrors the previous behaviour -- confirm it is intended.

    Args:
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        dict mapping document id to its score for the query.
    """
    stemmer = PorterStemmer()
    requete = dict(stemmer.getTextRepresentation(query))
    obj_index = self._index
    index_inv = obj_index.getIndexInv()

    # Single pass over the vocabulary: accumulate the collection size and
    # remember the collection frequency of the stems used by the query.
    totalTfsCol = 0
    dicoTfsTermesInQueryColl = dict()
    for t in index_inv.keys():
        tfTermeCollection = sum(obj_index.getTfsForStem(t).values())
        totalTfsCol += tfTermeCollection
        if t in requete:
            dicoTfsTermesInQueryColl[t] = tfTermeCollection

    # Candidate documents: those containing at least one query stem.
    # The list keeps first-seen order, the set makes de-duplication O(1).
    idsDoc = []
    seen_docs = set()
    for term in requete:
        tfs_term = obj_index.getTfsForStem(term)
        # The lookup appears to return 0 for a stem missing from the
        # inverted index (the original compared against 0); any falsy
        # result means there is nothing to collect.
        if not tfs_term:
            continue
        for num_doc in tfs_term.keys():
            if num_doc not in seen_docs:
                seen_docs.add(num_doc)
                idsDoc.append(num_doc)

    dicoScores = dict()
    for num_doc in idsDoc:
        # Keep only the terms known to the inverted index, as before, but
        # walk the document's own terms instead of the whole vocabulary.
        tfsForDoc = obj_index.getTfsForDoc(num_doc)
        dictTfTermesDoc = {
            terme: tf for terme, tf in tfsForDoc.items() if terme in index_inv
        }
        TotalTfTerms = sum(dictTfTermesDoc.values())

        score_doc = 1
        for stem in requete:
            if stem in dictTfTermesDoc:
                score_doc *= (dictTfTermesDoc[stem] / TotalTfTerms
                              + dicoTfsTermesInQueryColl[stem] / totalTfsCol)
        dicoScores[num_doc] = score_doc
    return dicoScores
def getWeigtsForQuery(self, query):
    """Return an IDF-style weight for every stem of the query.

    The weight of a stem is ``log(N / (1 + n))`` where ``N`` is
    ``len(self.index.docs)`` and ``n`` is the length of the first value of
    ``self.index.getTfsForStem(stem)``.

    NOTE(review): presumably that first value is the stem's posting
    collection, making ``n`` the document frequency -- confirm against the
    index implementation.

    Args:
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        dict mapping each query stem to its weight.

    Raises:
        IndexError: if the index returns an empty mapping for a stem.
    """
    ps = PorterStemmer()
    nb_doc = len(self.index.docs)
    tf_q = ps.getTextRepresentation(query)

    weights = {}
    for stem in tf_q.keys():
        # dict views are not subscriptable on Python 3; materialise first.
        first_value = list(self.index.getTfsForStem(stem).values())[0]
        # float() forces true division so the ratio is not truncated when
        # the code runs under Python 2.
        weights[stem] = np.log(float(nb_doc) / (1 + len(first_value)))
    return weights
def getFeatures(self, idDoc, query):
    """Return the query-only features ``(idf_sum, len_query)``.

    The pair is cached in ``self.featuresqueryidflen`` under a key built
    from the stemmed query, so it is computed once per distinct query.

    Args:
        idDoc: document id; unused, the features depend on the query only.
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        tuple ``(idf_sum, len_query)``: the sum of the query-stem weights
        and the sum of the query's stem frequencies.
    """
    ps = PorterStemmer()
    # Keep the mapping itself: the cache key needs its items and the query
    # length needs its values (calling .items() on the values always failed).
    tf_query = ps.getTextRepresentation(query)
    querykey = tuple(tf_query.items())

    if querykey not in self.featuresqueryidflen:
        # NOTE(review): `weighter` is not defined in this method and the
        # sibling weighters name this method `getWeigtsForQuery`; left
        # untouched because the right target cannot be determined here.
        idf_query = weighter.WeigtsForQuery(query)
        # list(...) so numpy receives a sequence rather than a dict view.
        idf_sum = np.sum(list(idf_query.values()))
        # Query length taken as the total number of stem occurrences;
        # summing the (stem, tf) pairs of the key was not a numeric sum.
        len_query = np.sum(list(tf_query.values()))
        self.featuresqueryidflen[querykey] = (idf_sum, len_query)
    return self.featuresqueryidflen[querykey]
def getScores(self, query):
    """Score documents for a query with the vector-space model.

    The raw score of a document is the dot product between the query
    weights and the document weights over the query stems. When
    ``self._normalized`` is not ``False`` the score is divided by the
    product of the query norm and the document norm (cosine similarity).

    Args:
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        dict mapping document id to its score. A document whose cosine
        denominator is zero gets ``0.0`` instead of raising.
    """
    stemmer = PorterStemmer()
    query_stemmed = stemmer.getTextRepresentation(query)

    # Weights of every query stem in the documents containing it.
    dictWeightDocs = dict()
    for terme in query_stemmed.keys():
        dictWeightDocs[terme] = self._weighter.getWeightsForStem(terme)

    q = self._weighter.getWeightsForQuery(query_stemmed)

    dictScoresDocs = dict()
    for term, weights in dictWeightDocs.items():
        # Previously only the first contribution of a document was guarded
        # by this check; later ones indexed q[term] unconditionally. The
        # guard now applies to every contribution of the stem.
        if not Utils.isPresent(q, term):
            continue
        q_weight = q[term]
        for doc, occ in weights.items():
            dictScoresDocs[doc] = dictScoresDocs.get(doc, 0) + q_weight * occ

    # Explicit comparison kept on purpose: only a value equal to False
    # selects the plain dot product.
    if self._normalized == False:
        return dictScoresDocs

    # Cosine similarity: normalise by the query and document norms.
    norme_q = math.sqrt(sum(
        math.pow(value, 2)
        for terme, value in q.items()
        if terme in query_stemmed
    ))

    dictScoreCos = dict()
    for doc, score in dictScoresDocs.items():
        doc_weights = self._weighter.getWeightsForDoc(doc)
        norme_d = math.sqrt(sum(
            math.pow(value, 2) for value in doc_weights.values() if value != 0
        ))
        denominator = norme_q * norme_d
        # A zero norm means an all-zero vector; report no similarity
        # rather than dividing by zero.
        dictScoreCos[doc] = score / denominator if denominator else 0.0
    return dictScoreCos
def getFeatures(self, idDoc, query):
    """Return the value the underlying model's ranking gives to a document.

    Rankings are cached in ``self.featuredocquery`` under a key built from
    the stemmed query. The ranking is (re)computed with
    ``self.model.getRanking(query)`` when the query is not cached yet or
    when the cached ranking has no entry for ``idDoc``.

    Args:
        idDoc: id of the document to look up.
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        The second field of the first ranking entry whose first field
        equals ``idDoc``.

    Raises:
        IndexError: if the ranking has no entry for ``idDoc``.
    """
    stemmer = PorterStemmer()
    stems = stemmer.getTextRepresentation(query)
    querykey = tuple(stems.items())

    cache = self.featuredocquery
    must_rank = querykey not in cache.keys()
    if not must_rank:
        ranked_ids = [entry[0] for entry in cache[querykey]]
        must_rank = idDoc not in ranked_ids
    if must_rank:
        cache[querykey] = self.model.getRanking(query)

    matches = [entry[1] for entry in cache[querykey] if entry[0] == idDoc]
    return matches[0]
class GroundTruthParser(object):
    """Read queries from a file together with their relevance judgements."""

    def __init__(self, query_file, relevance_file):
        """Open the query file and load the relevance judgements.

        Args:
            query_file: path of the file holding the queries.
            relevance_file: path of the relevance file. Each useful line
                holds at least four space-separated integers: the query id
                followed by three values stored as one judgement. Lines
                whose first field is ``#``, empty, or a bare newline are
                skipped.
        """
        # Handle kept for compatibility; it is only closed by nextQuery()
        # once the queries are exhausted. The parser below opens the file
        # on its own through initFile().
        self.query = open(query_file, 'r')
        self.textRepresenter = PorterStemmer()
        # Tracks whether self.query has already been closed.
        self.already_closed = False

        # NOTE(review): an earlier comment warned this only works on the
        # CACM dataset while the parser used is ParserCLEF08 -- confirm
        # which dataset is actually supported.
        self.parser = ParserCLEF08()
        self.parser.initFile(query_file)

        # query_id -> list of [int, int, int] judgements, one per line.
        self.relevant_docs = {}
        with open(relevance_file, 'r') as f:
            for line in f:
                data = line.split(" ")
                if data[0] in ("#", "", "\n"):
                    continue
                query_id = int(data[0])
                # int() ignores surrounding whitespace, so the trailing
                # newline needs no slicing; slicing dropped a real digit
                # when the last field was not followed by a newline.
                # setdefault replaces dict.has_key (removed in Python 3).
                self.relevant_docs.setdefault(query_id, []).append(
                    [int(data[1]), int(data[2]), int(data[3])])

    def nextQuery(self):
        """Return the next query as a ``GroundTruth`` object.

        Returns:
            A ``GroundTruth`` built from the query id, its text, its
            stemmed representation and a numpy array of its judgements,
            or ``-1`` once the parser has no more queries (the query file
            handle is closed the first time this happens).
        """
        query_data = self.parser.nextDocument()
        if query_data is None:
            if not self.already_closed:
                self.query.close()
                self.already_closed = True
            return -1

        query_id = query_data.getId()
        query_text = query_data.getText()
        query_tf = self.textRepresenter.getTextRepresentation(query_text)
        # A query without judgements yields np.array(None).
        relevance = np.array(self.relevant_docs.get(int(query_id)))
        return GroundTruth(query_id, query_text, query_tf, relevance)
def elementsFromDoc(self, doc):
    """Return the stemmed representation of a document's cleaned text.

    Cleaning steps, in order: lower-case the text, replace punctuation and
    stand-alone hyphens with spaces, replace newlines with spaces, remove
    digit runs, remove runs of single-character words, collapse repeated
    spaces.

    Args:
        doc: object exposing ``getText()``.

    Returns:
        Whatever ``PorterStemmer.getTextRepresentation`` returns for the
        cleaned text.
    """
    text = doc.getText().lower()

    # Punctuation to blank out. The earlier hand-written alternation left
    # `[`, `]`, `\`, `_` and `^` unescaped, so they were never removed (and
    # the bare `^` matched the empty string at the start of the text);
    # re.escape inside a character class handles them all.
    punctuation = "!#\"%$'&)(+*,/.;:=<?>@[]\\_^`{}|~"
    # Hyphens are only dropped when they stand alone between spaces or at
    # the text boundaries, so hyphenated words stay intact.
    strip_pattern = "[" + re.escape(punctuation) + "]|(^| )(-( |$))+"
    text = re.sub(strip_pattern, ' ', text)

    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Drop runs of one-character words.
    text = re.sub(r'(^| )(\w($| ))+', ' ', text)
    text = re.sub(r' +', ' ', text)

    stemmer = PorterStemmer()
    return stemmer.getTextRepresentation(text)
def indexation(self, collection):
    """Build the raw and the length-normalised indexes of a collection.

    Four structures are produced and stored through the setters:

    * index: ``{doc_id: {stem: tf}}``
    * inverted index: ``{stem: {doc_id: tf}}``
    * normalised index: ``{doc_id: {stem: tf / sum of the doc's tfs}}``
    * normalised inverted index: ``{stem: {doc_id: normalised tf}}``

    A document whose frequencies sum to zero keeps its raw values in the
    normalised structures.

    Args:
        collection: mapping indexed by the integers ``1..len(collection)``
            whose values expose the document text as attribute ``T``.
    """
    # Raw forward and inverted indexes.
    stemmer = PorterStemmer()
    index = dict()
    ind_rev = dict()
    for i in range(1, len(collection) + 1):
        doc = collection[i]
        index[i] = stemmer.getTextRepresentation(doc.T)
        for stem, tf in index[i].items():
            ind_rev.setdefault(stem, {})[i] = tf
    self.setIndex(index)
    self.setIndexInv(ind_rev)

    # Normalised forward and inverted indexes.
    indexNormalise = dict()
    indexInvNormalise = dict()
    for doc_id, tfs in self._index.items():
        somme = sum(tfs.values())
        if somme != 0:
            normalised = {stem: tf / somme for stem, tf in tfs.items()}
        else:
            normalised = dict(tfs)
        indexNormalise[doc_id] = normalised
        # Each stem owns its own posting dict. The previous code shared a
        # single per-document dict between all stems and replaced a stem's
        # postings on every new document, losing the earlier ones.
        for stem, weight in normalised.items():
            indexInvNormalise.setdefault(stem, {})[doc_id] = weight

    self.setIndexNormalise(indexNormalise)
    self.setIndexInvNormalise(indexInvNormalise)
def getScores(self, query):
    """Score documents for a query with an Okapi BM25-style formula.

    For each query stem ``t`` present in the inverted index and each
    document ``d`` containing it, the contribution is::

        idf(t) * f / (f + k1 * (1 - b + b * |d| / avgdl))

    with ``f = tf(t, d)``, ``idf(t) = log((1 + N) / (1 + df(t)))``,
    ``k1 = 1.2`` and ``b = 0.75``. Contributions are summed per document.

    NOTE(review): the textbook numerator is ``f * (k1 + 1)``; the constant
    factor is absent here (as before), which scales scores without
    changing their order.

    Args:
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        dict mapping document id to its score; only documents containing
        at least one query stem appear. Empty when the index is empty.
    """
    k1 = 1.2
    b = 0.75

    ps = PorterStemmer()
    requete = ps.getTextRepresentation(query)  # {stem: count}

    index = self._index.getIndex()
    indexInv = self._index.getIndexInv()
    nb_documents = len(index)

    # Nothing to score, and the average length below would divide by zero.
    if not index:
        return dict()

    # Average document length over the whole collection.
    count = sum(sum(tfs.values()) for tfs in index.values())
    avg = count / len(index)

    doc_lengths = dict()  # document lengths, computed once per document
    dictScores = dict()
    for mot in dict(requete):
        if mot not in indexInv:
            continue
        postings = indexInv[mot]
        # The idf only depends on the stem: compute it once per stem.
        idf = log((1 + nb_documents) / (1 + len(postings)))
        for key, f_qi_d in postings.items():
            if key not in doc_lengths:
                doc_lengths[key] = sum(self._index.getTfsForDoc(key).values())
            D = doc_lengths[key]
            score = idf * ((f_qi_d * 1)
                           / (f_qi_d + k1 * (1 - b + b * (D / avg))))
            dictScores[key] = dictScores.get(key, 0) + score
    return dictScores
def getWeigtsForQuery(self, query):
    """Return the query weights as the query's stemmed representation.

    Args:
        query: raw query text.

    Returns:
        The result of ``PorterStemmer.getTextRepresentation(query)``.
    """
    return PorterStemmer().getTextRepresentation(query)
def getWeigtsForQuery(self, query):
    """Return binary query weights: every stem of the query weighs 1.

    The mapping is also stored on the instance as ``self.Weights_Query``.

    Args:
        query: raw query text; it is stemmed with ``PorterStemmer``.

    Returns:
        dict mapping each stem of the query to ``1``.
    """
    stems = PorterStemmer().getTextRepresentation(query).keys()
    self.Weights_Query = {stem: 1 for stem in stems}
    return self.Weights_Query