Exemplo n.º 1
0
    def search(self, query, queryExpansionParameter):
        """Run a TF-IDF search with query expansion and print the ranking.

        The query is first scored with ``self.tfidf``. The tokens of the
        document at the head of that result list are scored with
        ``self.tfidf_term`` and inserted into a sorted list; the leading
        entries of that list are appended to the query, and the expanded
        query is scored again. One line per result is printed in the form
        ``<queryId> 0 <docId> <rank> <score> 0``.

        Args:
            query: Query string; expansion terms are appended to it,
                separated by single spaces.
            queryExpansionParameter: Maximum number of expansion terms to
                append. Fewer are used when the top document has fewer
                unique tokens.

        Returns:
            None. Output goes to standard output and
            ``self._currentQueryId`` is incremented once a ranking has been
            printed. When either ``self.tfidf`` call returns None, nothing
            is printed and the query id is left unchanged.
        """
        orderedQueryResults = self.tfidf(query)
        if orderedQueryResults is None:
            return None

        # Translate the id of the top hit through self._documentIds to the
        # numeric name used by the document collection.
        topDocId = orderedQueryResults.getHead().docId
        topCollectionDocId = int(self._documentIds[topDocId])

        # Step through the collection until the top hit is reached.
        # NOTE(review): nextDocument() looks like a forward-only cursor;
        # confirm what it returns when exhausted and whether it has to be
        # reset between searches.
        currentDoc = self._docCollect.nextDocument()
        while int(currentDoc.getName()) != topCollectionDocId:
            currentDoc = self._docCollect.nextDocument()

        self._tokenizer.loadDocument(currentDoc)

        # Collect the distinct tokens of that document (the last position
        # seen for a token overwrites earlier ones).
        uniqueTerms = {}
        token, position = self._tokenizer.nextToken()
        while token is not None:
            uniqueTerms[token] = position
            token, position = self._tokenizer.nextToken()

        # Score every distinct token and keep the tokens ordered by score.
        orderedExpansionTerms = LinkedList()
        orderedExpansionTerms.printSep = "\n"
        for token in uniqueTerms:
            score = self.tfidf_term(token, topCollectionDocId)
            # The docId slot of QueryResultItem carries the token here.
            orderedExpansionTerms.insertSorted(QueryResultItem(token, score))

        # Append at most queryExpansionParameter terms; stop early when the
        # list runs out instead of dereferencing a missing node.
        currentNode = orderedExpansionTerms._head
        remaining = queryExpansionParameter
        while currentNode is not None and remaining > 0:
            query += " " + currentNode.value.docId
            currentNode = currentNode.next
            remaining -= 1

        # Search again with the expanded query. tfidf() may return None, so
        # mirror the guard used for the first call.
        results = self.tfidf(query)
        if results is None:
            return None

        print("\n\nComplete query:")
        print(query)
        print("results:\n")

        current = results._head
        rank = 0
        while current is not None:
            item = current.value
            fields = [
                str(self._currentQueryId),
                "0",
                str(item.docId),
                str(rank),
                str(item.tfidf),
                "0",
            ]
            # A single parenthesised argument prints identically on
            # Python 2 and Python 3.
            print(" ".join(fields))
            current = current.next
            rank += 1

        self._currentQueryId += 1
Exemplo n.º 2
0
 def tfidf(self, query):
     """Score documents against a query and return them as an ordered list.

     The query is tokenized with ``self._queryTokenizer`` and the posting
     lists for the tokens are fetched through ``self._getPostingLists``.
     Each posting adds
     ``(1 + log10(count / termFrequencies[doc - 1])) * idf`` to the score
     of its document, where ``idf`` comes from ``self._getIDF`` applied to
     the posting list.

     Args:
         query: Query string to tokenize and score.

     Returns:
         A ``LinkedList`` filled through ``insertSorted`` with one
         ``QueryResultItem(docId, score)`` per scored document (document
         ids are the ``str`` form of ``doc.doc``), or None as soon as one
         of the posting lists is None.
     """
     queryTokens = self._queryTokenizer.tokenize(query)
     postingLists = self._getPostingLists(queryTokens)

     # Accumulate one score per document across all posting lists.
     documentScores = {}
     for postings in postingLists:
         if postings is None:
             # A missing posting list aborts the whole query.
             return None
         for posting in postings:
             # Posting count relative to the per-document value stored at
             # index doc - 1 of self.termFrequencies.
             relativeCount = (float(posting.count)
                              / float(self.termFrequencies[int(posting.doc) - 1]))
             termWeight = 1 + math.log10(relativeCount)
             # _getIDF stays inside the loop so it is never called for an
             # empty posting list.
             docScore = termWeight * self._getIDF(postings)
             docKey = str(posting.doc)
             # Direct dict membership avoids building a key list per posting.
             if docKey in documentScores:
                 documentScores[docKey] += docScore
             else:
                 documentScores[docKey] = docScore

     # Build the list of documents ordered by score.
     orderedQueryResults = LinkedList()
     orderedQueryResults.printSep = "\n"
     for docId, score in documentScores.items():
         orderedQueryResults.insertSorted(QueryResultItem(docId, score))
     return orderedQueryResults