Пример #1
0
def calcBM25Scores(termCount, myindex, query, coder):
    """Score the documents matching a boolean query with the BM25 model.

    Candidate documents are those listed under any OR or AND term. A
    candidate is dropped when it is listed under a NOT term, or when AND
    terms were given and it is listed under none of them.

    Besides its arguments, the function reads ``index``, ``log10`` and the
    tuning constants ``k1``, ``b`` and ``k2`` from the enclosing module.

    Args:
        termCount: Mapping read with a docID key (used as the document
            length in the normalisation factor) and with the keys
            'average' and 'totalDocs'.
        myindex: Passed through unchanged to ``index.getTermContent``.
        query: Dict with optional 'AND', 'OR' and 'NOT' lists of terms.
        coder: Passed through unchanged to ``index.getTermContent``.

    Returns:
        A tuple ``(scores, indocids, exdocids)``: ``scores`` is a list of
        ``[docID, score]`` pairs with one entry per surviving document,
        ``indocids`` lists the docIDs found under the AND terms and
        ``exdocids`` the docIDs found under the NOT terms.
    """
    notTerms = query.get('NOT', [])
    andTerms = query.get('AND', [])
    orTerms = query.get('OR', [])

    # Terms that contribute to the score: OR terms followed by AND terms.
    # Repeated terms are kept; they raise the query term frequency below.
    terms = list(orTerms)
    terms.extend(andTerms)

    # Fetch every scored term once and collect the candidate documents.
    termIndex = {}
    docids = []
    for term in terms:
        tc = index.getTermContent(myindex, term, coder)
        termIndex[term] = tc
        # Default to an empty dict: a list default has no .keys().
        docids.extend(tc.get('docs', {}).keys())

    # Documents listed under a NOT term are excluded.
    exdocids = []
    for term in notTerms:
        tc = index.getTermContent(myindex, term, coder)
        exdocids.extend(tc.get('docs', {}).keys())

    # Documents listed under at least one AND term.
    indocids = []
    for term in andTerms:
        indocids.extend(termIndex[term].get('docs', {}).keys())

    # Term frequency inside the query itself.
    qFreq = {}
    for term in terms:
        qFreq[term] = qFreq.get(term, 0) + 1

    # Sets give constant-time membership tests; the lists are still returned.
    excluded = set(exdocids)
    required = set(indocids)
    alreadyScored = set()

    scores = []
    for docID in docids:
        # A document listed under several terms appears several times in
        # docids; score it only once.
        if docID in alreadyScored:
            continue
        alreadyScored.add(docID)

        if docID in excluded:
            continue
        if required and docID not in required:
            continue

        # Document-length normalisation factor.
        K = k1 * ((1 - b) + (b * termCount.get(docID) / termCount.get('average')))
        docScore = 0.0
        for term in terms:
            entry = termIndex[term]
            docFreq = entry.get('count', 0)
            tf = entry.get('docs', {}).get(docID, 0.0)

            idf = log10((termCount['totalDocs'] - docFreq + 0.5)
                        / (docFreq + 0.5))
            tfPart = idf * ((k1 + 1) * tf) / (K + tf)
            queryPart = ((k2 + 1) * qFreq[term]) / (k2 + qFreq[term])
            docScore = docScore + tfPart * queryPart
        scores.append([docID, docScore])
    return scores, indocids, exdocids
Пример #2
0
def calcQLScores(termCount, myindex, query, coder):
    """Score the documents matching a boolean query by query likelihood.

    Candidate documents are those listed under any OR or AND term. A
    candidate is dropped when it is listed under a NOT term, or when AND
    terms were given and it is listed under none of them. Each surviving
    document gets the sum, over the query terms, of the log10 of a mix of
    the in-document term ratio and the collection-wide term ratio,
    weighted by ``LAMBDA``.

    Besides its arguments, the function reads ``index``, ``log10`` and
    ``LAMBDA`` from the enclosing module.

    Args:
        termCount: Mapping read with a docID key (used as the document
            length) and with the key 'total' (the denominator of the
            collection-wide ratio).
        myindex: Passed through unchanged to ``index.getTermContent``.
        query: Dict with optional 'AND', 'OR' and 'NOT' lists of terms.
        coder: Passed through unchanged to ``index.getTermContent``.

    Returns:
        A tuple ``(scores, indocids, exdocids)``: ``scores`` is a list of
        ``[docID, score]`` pairs with one entry per surviving document,
        ``indocids`` lists the docIDs found under the AND terms and
        ``exdocids`` the docIDs found under the NOT terms.
    """
    notTerms = query.get('NOT', [])
    andTerms = query.get('AND', [])
    orTerms = query.get('OR', [])

    # Terms that contribute to the score: OR terms followed by AND terms.
    terms = list(orTerms)
    terms.extend(andTerms)

    # Fetch every scored term once and collect the candidate documents.
    termIndex = {}
    docids = []
    for term in terms:
        tc = index.getTermContent(myindex, term, coder)
        termIndex[term] = tc
        # Default to an empty dict: a list default has no .keys().
        docids.extend(tc.get('docs', {}).keys())

    # Documents listed under a NOT term are excluded.
    exdocids = []
    for term in notTerms:
        tc = index.getTermContent(myindex, term, coder)
        exdocids.extend(tc.get('docs', {}).keys())

    # Documents listed under at least one AND term.
    indocids = []
    for term in andTerms:
        indocids.extend(termIndex[term].get('docs', {}).keys())

    # Sets give constant-time membership tests; the lists are still returned.
    excluded = set(exdocids)
    required = set(indocids)
    alreadyScored = set()

    scores = []
    for docID in docids:
        # A document listed under several terms appears several times in
        # docids; score it only once.
        if docID in alreadyScored:
            continue
        alreadyScored.add(docID)

        if docID in excluded:
            continue
        if required and docID not in required:
            continue

        docScore = 0.0
        for term in terms:
            entry = termIndex[term]
            # Terms with a zero count contribute nothing to the score.
            if entry['count'] != 0:
                tf = entry.get('docs', {}).get(docID, 0.0)
                docScore = docScore + log10(
                    (1 - LAMBDA) * (tf * 1.0 / termCount[docID]) +
                    LAMBDA * (entry['count'] * 1.0 / termCount['total']))
        scores.append([docID, docScore])
    return scores, indocids, exdocids