Пример #1
0
    def getScores(self, query):
        """Score documents for a query with a unigram language model.

        Every document that contains at least one stem of the query gets a
        score equal to the product, over the query stems present in that
        document, of::

            tf(stem, doc) / total_tf(doc) + tf(stem, collection) / total_tf(collection)

        query : text of the query
        Returns a dictionary mapping each matching document id to its score.
        """
        ps = PorterStemmer()
        # Stem the query; only the set of stems is used below.
        requete = dict(ps.getTextRepresentation(query))
        obj_index = self._index
        index_inv = obj_index.getIndexInv()

        # Collection statistics: total number of occurrences over all stems,
        # and the collection frequency of each stem that appears in the query.
        dicoTfsTermesInQueryColl = dict()
        totalTfsCol = 0
        for t in index_inv:
            # getTfsForStem(t) has the form {doc_id: occurrences of t in doc}
            tfTermeCollection = sum(obj_index.getTfsForStem(t).values())
            totalTfsCol += tfTermeCollection
            if t in requete:
                dicoTfsTermesInQueryColl[t] = tfTermeCollection

        # Documents containing at least one query stem, in first-seen order.
        # A companion set makes the duplicate check O(1) instead of scanning
        # the list for every posting.
        idsDoc = []
        seen_docs = set()
        for term in requete:
            tfs_term = obj_index.getTfsForStem(term)
            # NOTE(review): the index apparently returns 0 for a stem that is
            # not in the inverted index -- confirm against getTfsForStem.
            if tfs_term != 0:
                for num_doc in tfs_term:
                    if num_doc not in seen_docs:
                        seen_docs.add(num_doc)
                        idsDoc.append(num_doc)

        dicoScores = dict()
        for num_doc in idsDoc:
            # tfsForDoc has the form {stem: tf(stem, doc)}
            tfsForDoc = obj_index.getTfsForDoc(num_doc)
            # Keep the stems of the document that are known to the inverted
            # index, in inverted-index order (this fixes the order in which
            # the factors of the product are multiplied).
            dictTfTermesDoc = {
                terme: tfsForDoc[terme]
                for terme in index_inv if terme in tfsForDoc
            }
            TotalTfTerms = sum(dictTfTermesDoc.values())

            score_doc = 1
            for stem, tf in dictTfTermesDoc.items():
                if stem in requete:
                    score_doc *= (tf / TotalTfTerms +
                                  dicoTfsTermesInQueryColl[stem] / totalTfsCol)
                    # Only documents with at least one matching stem are
                    # recorded, hence the assignment inside the branch.
                    dicoScores[num_doc] = score_doc
        return dicoScores
Пример #2
0
 def getWeigtsForQuery(self, query):
     """Return an IDF-style weight for every stem of the query.

     The weight of a stem is ``log(nb_doc / (1 + n))`` where ``nb_doc`` is
     ``len(self.index.docs)`` and ``n`` is the length of the first value
     returned by ``self.index.getTfsForStem(stem)``.

     query : text of the query
     Returns a dictionary {stem: weight}.
     """
     ps = PorterStemmer()
     nb_doc = len(self.index.docs)
     tf_q = ps.getTextRepresentation(query)
     weights = {}
     for stem in tf_q.keys():
         # Dict views cannot be subscripted on Python 3; materialising the
         # values keeps the positional access (and the IndexError raised on
         # an empty result) working on both Python 2 and Python 3.
         # NOTE(review): assumes the first value is a sized collection of
         # postings for the stem -- confirm against getTfsForStem.
         first_entry = list(self.index.getTfsForStem(stem).values())[0]
         weights[stem] = np.log(nb_doc / (1 + len(first_entry)))
     return weights
 def getFeatures(self, idDoc, query):
     """Return the cached ``(idf_sum, len_query)`` pair for a query.

     The pair is computed on first use and stored in
     ``self.featuresqueryidflen`` under a key built from the query's
     (stem, tf) items.

     idDoc : document id (unused here; the feature depends on the query only)
     query : text of the query
     Returns a tuple ``(sum of the query weights, sum of the query tfs)``.
     """
     ps = PorterStemmer()
     # Keep the full {stem: tf} mapping: the cache key needs its items and
     # the query length needs its values (calling .items() on .values(),
     # as before, always raised AttributeError).
     tf_query = ps.getTextRepresentation(query)
     querykey = tuple(tf_query.items())
     if querykey not in self.featuresqueryidflen:
         # NOTE(review): `weighter` is not defined in this method and
         # `WeigtsForQuery` does not match the `getWeigtsForQuery` name used
         # by the weighters -- left untouched, confirm the intended target.
         idf_query = weighter.WeigtsForQuery(query)
         # np.sum needs a real sequence, not a dict view.
         idf_sum = np.sum(list(idf_query.values()))
         # Query length taken as the total number of term occurrences
         # (presumed intent; the previous code summed (stem, tf) pairs).
         len_query = np.sum(list(tf_query.values()))
         self.featuresqueryidflen[querykey] = (idf_sum, len_query)
     return self.featuresqueryidflen[querykey]
Пример #4
0
    def getScores(self, query):
        """Score documents for a query with the vector-space model.

        The raw score of a document is the dot product between the query
        weights and the document weights, restricted to the stems of the
        query. When ``self._normalized`` is not ``False`` the dot product is
        divided by the product of the query norm and the document norm
        (cosine similarity).

        query : the query text
        Returns a dictionary {doc_id: score}.
        """
        ps = PorterStemmer()
        query_stemmed = ps.getTextRepresentation(query)

        # For every query stem, the weights of the documents containing it.
        dictWeightDocs = dict()
        for terme in query_stemmed.keys():
            dictWeightDocs[terme] = self._weighter.getWeightsForStem(terme)
        q = self._weighter.getWeightsForQuery(query_stemmed)

        dictScoresDocs = dict()
        for term, weights_by_doc in dictWeightDocs.items():
            # Skip stems that have no query weight. Checking once per stem
            # (instead of only when a document is first inserted) avoids a
            # KeyError on q[term] for documents that were already scored.
            if not Utils.isPresent(q, term):
                continue
            q_weight = q[term]
            for doc, occ in weights_by_doc.items():
                if doc in dictScoresDocs:
                    dictScoresDocs[doc] += q_weight * occ
                else:
                    dictScoresDocs[doc] = q_weight * occ

        # Equality (not truthiness) is kept on purpose so that exactly the
        # same values of _normalized as before select the raw dot product.
        if self._normalized == False:
            return dictScoresDocs

        # Cosine similarity: norm of the query over its own stems ...
        norme_q = math.sqrt(
            sum(
                math.pow(value, 2) for terme, value in q.items()
                if terme in query_stemmed))

        # ... and norm of each scored document over all of its weights.
        norme_d = dict()
        for cle in dictScoresDocs.keys():
            dictTerms = self._weighter.getWeightsForDoc(cle)
            norme_d[cle] = math.sqrt(
                sum(
                    math.pow(value, 2) for value in dictTerms.values()
                    if value != 0))

        dictScoreCos = dict()
        for doc, score in dictScoresDocs.items():
            denominateur = norme_q * norme_d[doc]
            # A zero norm means a null vector: report a similarity of 0
            # rather than raising ZeroDivisionError.
            dictScoreCos[doc] = score / denominateur if denominateur != 0 else 0.0
        return dictScoreCos
 def getFeatures(self, idDoc, query):
     """Return the score of ``idDoc`` in the model's ranking for ``query``.

     Rankings are cached in ``self.featuredocquery`` under a key built from
     the query's (stem, tf) items. The ranking is (re)computed with
     ``self.model.getRanking(query)`` when the query is not cached yet or
     when ``idDoc`` does not appear in the cached ranking.

     idDoc : document id to look up
     query : text of the query (a ranking is a list of entries whose first
             item is a document id and whose second item is its score)
     Raises IndexError when ``idDoc`` is absent from the ranking.
     """
     stemmer = PorterStemmer()
     cache_key = tuple(stemmer.getTextRepresentation(query).items())

     must_rank = cache_key not in self.featuredocquery.keys()
     if not must_rank:
         ranked_ids = [entry[0] for entry in self.featuredocquery[cache_key]]
         must_rank = idDoc not in ranked_ids
     if must_rank:
         fresh_ranking = self.model.getRanking(query)
         self.featuredocquery[cache_key] = fresh_ranking

     matching_scores = [
         entry[1] for entry in self.featuredocquery[cache_key]
         if entry[0] == idDoc
     ]
     return matching_scores[0]
Пример #6
0
class GroundTruthParser(object):
    """Read queries from a file and pair them with relevance judgements."""

    def __init__(self, query_file, relevance_file):
        """Open the query source and load the relevance judgements.

        query_file     : path of the file containing the queries
        relevance_file : path of a text file with one judgement per line,
                         made of four space-separated integer fields; the
                         first field is the query id. Lines whose first
                         field is "#", empty or a bare newline are skipped.
        """
        # Handle kept on the instance; it is closed by nextQuery() once the
        # parser is exhausted.
        self.query = open(query_file, 'r')
        self.textRepresenter = PorterStemmer()

        # Tracks whether self.query has already been closed.
        self.already_closed = False

        # Parser used to read query_file.
        # NOTE(review): the parser class is hard-coded; the original author
        # warned that this only works for one dataset (TODO find a solution).
        self.parser = ParserCLEF08()
        self.parser.initFile(query_file)

        # {query_id: [[field1, field2, field3], ...]} -- one inner list per
        # judgement line, kept as a list for later use as (themes, score).
        self.relevant_docs = {}
        with open(relevance_file, 'r') as f:
            for line in f:
                data = line.split(" ")
                if data[0] in ("#", "", "\n"):
                    continue
                query_id = int(data[0])
                # int() ignores surrounding whitespace, so the trailing
                # newline needs no manual slicing (slicing dropped a digit
                # when the last line had no newline).
                self.relevant_docs.setdefault(query_id, []).append(
                    [int(data[1]), int(data[2]), int(data[3])])

    def nextQuery(self):
        """Return the next query as a GroundTruth object.

        Returns -1 once the parser has no more documents; the query file
        handle is closed the first time this happens, and every later call
        keeps returning -1.
        """
        query_data = self.parser.nextDocument()

        if query_data is None:
            if not self.already_closed:
                self.query.close()
                self.already_closed = True
            # Returned on every exhausted call, not only the first one;
            # otherwise a second call would dereference None below.
            return -1

        query_id = query_data.getId()
        query_text = query_data.getText()
        query_tf = self.textRepresenter.getTextRepresentation(query_text)
        # np.array(None) is produced when the query has no judgement.
        relevance = np.array(self.relevant_docs.get(int(query_id)))

        return GroundTruth(query_id, query_text, query_tf, relevance)
Пример #7
0
    def elementsFromDoc(self, doc):
        """Return the stemmed representation of a document's text.

        The text returned by ``doc.getText()`` is lower-cased, stripped of
        punctuation, newlines, digits and single-character words, then
        handed to ``PorterStemmer.getTextRepresentation``.

        doc : object exposing ``getText()``
        Returns whatever ``getTextRepresentation`` returns for the cleaned
        text.
        """
        text = doc.getText().lower()

        # Replace every ASCII punctuation character with a space. Hyphens are
        # only removed when they stand alone (surrounded by spaces or at the
        # text boundaries), so hyphenated words are kept.
        # A character class is used so that '[', ']', '\\', '_' and '^' are
        # matched literally; written as bare alternatives they were parsed
        # as a class, an escaped pipe and an anchor and were never removed.
        text = re.sub(
            r'[!"#$%&\'()*+,./:;<=>?@\[\\\]^_`{|}~]|(^| )(-( |$))+', ' ',
            text)
        text = re.sub(r'\n', ' ', text)
        # Digits are dropped without leaving a space behind.
        text = re.sub(r'\d+', '', text)
        # Remove runs of single-character words.
        text = re.sub(r'(^| )(\w($| ))+', ' ', text)
        # Collapse repeated spaces.
        text = re.sub(r' +', ' ', text)

        stemmer = PorterStemmer()
        return stemmer.getTextRepresentation(text)
Пример #8
0
 def indexation(self, collection):
     """Build the plain and the normalised indexes of a collection.

     Four structures are computed and stored through the setters:
       - index:               {doc_id: {stem: tf}}
       - inverted index:      {stem: {doc_id: tf}}
       - normalised index:    {doc_id: {stem: tf / total tf of the doc}}
       - normalised inverted: {stem: {doc_id: tf / total tf of the doc}}

     collection : container indexed from 1 to len(collection); the ``T``
                  attribute of each document is passed to the stemmer.
     """
     # ---- Plain indexing ----
     index = dict()
     ind_rev = dict()
     for i in range(1, len(collection) + 1):
         doc = collection[i]
         pS = PorterStemmer()
         index[i] = pS.getTextRepresentation(doc.T)
         for key, item in index[i].items():
             ind_rev.setdefault(key, {})[i] = item
     self.setIndex(index)
     self.setIndexInv(ind_rev)

     # ---- Normalised indexing ----
     indexNormalise = dict()
     indexInvNormalise = dict()
     for key, dico in self._index.items():
         somme = sum(dico.values())
         dictInDict = dict()
         for stem, tf in dico.items():
             # Fall back to the raw tf when the document total is zero.
             poids = tf / somme if somme != 0 else tf
             dictInDict[stem] = poids
             # One posting per (stem, document). Previously a single dict
             # was shared by all stems of a document and replaced at each
             # document, leaving one wrong posting per stem.
             indexInvNormalise.setdefault(stem, {})[key] = poids
         # Documents without any stem are left out, as before.
         if dictInDict:
             indexNormalise[key] = dictInDict
     self.setIndexNormalise(indexNormalise)
     self.setIndexInvNormalise(indexInvNormalise)
Пример #9
0
    def getScores(self, query):
        """Score documents for a query with an Okapi-BM25-style formula.

        For every query stem found in the inverted index and every document
        containing it, the contribution added to the document's score is::

            idf * f / (f + k1 * (1 - b + b * D / avg))

        with ``f`` the frequency of the stem in the document, ``D`` the
        total number of occurrences in the document, ``avg`` the average of
        ``D`` over the index, ``idf = log((1 + N) / (1 + df))``, ``k1 = 1.2``
        and ``b = 0.75``. The numerator has no ``(k1 + 1)`` factor, as in
        the original implementation.

        query : text of the query
        Returns a dictionary {doc_id: score}; empty when the index is empty.
        """
        ps = PorterStemmer()
        requete = ps.getTextRepresentation(query)  # {stem: count}

        index = self._index.getIndex()
        indexInv = self._index.getIndexInv()
        nb_documents = len(index)  # number of documents in the collection
        if nb_documents == 0:
            # Nothing to score; also avoids dividing by zero for avg below.
            return dict()

        k1 = 1.2
        b = 0.75

        # Average document length, in occurrences.
        count = sum(tf for dico in index.values() for tf in dico.values())
        avg = count / len(index)

        doc_lengths = dict()  # per-call cache of D, keyed by document id
        dictScores = dict()
        for mot in dict(requete):
            if mot not in indexInv:
                continue
            postings = indexInv[mot]
            # idf depends on the stem only, so compute it once per stem.
            idf = log((1 + nb_documents) / (1 + len(postings)))

            for key, f_qi_d in postings.items():
                if key not in doc_lengths:
                    doc_lengths[key] = sum(
                        self._index.getTfsForDoc(key).values())
                D = doc_lengths[key]

                score = idf * (f_qi_d / (f_qi_d + k1 * (1 - b + b *
                                                        (D / avg))))

                if key in dictScores:
                    dictScores[key] += score
                else:
                    dictScores[key] = score

        return dictScores
Пример #10
0
 def getWeigtsForQuery(self, query):
     """Return the query weights: the stemmer's representation of ``query``.

     query : text of the query
     Returns whatever ``PorterStemmer.getTextRepresentation`` produces for
     the query.
     """
     return PorterStemmer().getTextRepresentation(query)
Пример #11
0
 def getWeigtsForQuery(self, query):
     """Return binary query weights: every stem of ``query`` weighs 1.

     The mapping is also stored on the instance as ``self.Weights_Query``.

     query : text of the query
     Returns a dictionary {stem: 1}.
     """
     stemmer = PorterStemmer()
     representation = stemmer.getTextRepresentation(query)
     self.Weights_Query = {stem: 1 for stem in representation.keys()}
     return self.Weights_Query