Exemplo n.º 1
0
    class Doc():
        """Entry for one document: its id, an occurrence count and its lines.

        Instances compare (``<`` and ``==``) by ``doc`` only.  Iterating an
        instance walks the nodes of ``self.lines`` from ``_head`` following
        ``next`` links and yields each node's ``value``.  The iteration cursor
        lives on the instance itself, so only one traversal can be in progress
        at a time.
        """

        def __init__(self, doc):
            """Create an entry for ``doc`` with an empty line list and count 1."""
            self.doc = doc
            self.lines = LinkedList()
            self.count = 1
            # Node most recently returned by next(); None means the next call
            # starts again from the head of ``self.lines``.
            self._iterator = None
            # Initialised here only; nothing in this class assigns it later.
            self.documentFrequency = None

        def reset(self):
            """Reset the underlying line list and rewind the iteration cursor."""
            self.lines.reset()
            self._iterator = None

        def __str__(self):
            """Return the document id, the count and the rendered line list."""
            return ('\n\t' + str(self.doc) + ', count:' + str(self.count)
                    + self.lines.toString(','))

        def __lt__(self, other):
            """Order entries by document id."""
            return self.doc < other.doc

        def __eq__(self, other):
            """Two entries are equal when their document ids are equal."""
            return self.doc == other.doc

        def insertLine(self, line):
            """Convert ``line`` to int, wrap it in a Line and insert it sorted."""
            line = IndexList.Line(int(line))
            self.lines.insertSorted(line)

        def __iter__(self):
            """Return self; the traversal state is kept in ``_iterator``."""
            return self

        def next(self):
            """Return the next line value, raising StopIteration at the end.

            Once the end is reached the cursor is None again, so a later
            traversal restarts from the head.
            """
            if self._iterator is None:
                if self.lines._head is None:
                    raise StopIteration
                self._iterator = self.lines._head
                return self._iterator.value
            self._iterator = self._iterator.next
            if self._iterator is None:
                raise StopIteration
            return self._iterator.value

        # Python 3 looks up ``__next__`` instead of ``next``; expose both so
        # ``for line in doc`` works on either interpreter.
        __next__ = next
Exemplo n.º 2
0
    def search(self, query, queryExpansionParameter):
        """Run ``query``, expand it with terms from the top hit, print the results.

        Steps, as implemented below:
          1. Score ``query`` with ``self.tfidf`` and take the head result.
          2. Advance ``self._docCollect`` until the document whose name matches
             that result is found, and tokenize it.
          3. Score every distinct token of that document with
             ``self.tfidf_term`` and keep them in a sorted list.
          4. Append up to ``queryExpansionParameter`` tokens from the front of
             that list to ``query`` and score the expanded query.
          5. Print one line per result and increment ``self._currentQueryId``.

        Args:
            query: query string; the expansion terms are appended to it
                separated by single spaces.
            queryExpansionParameter: maximum number of expansion terms to add.
                If the document has fewer distinct tokens, all of them are used.

        Returns:
            None.  Returns early (without printing or incrementing the query
            id) when the initial query yields no result list or no head item.
        """
        orderedQueryResults = self.tfidf(query)
        if orderedQueryResults is None:
            return None

        topHit = orderedQueryResults.getHead()
        if topHit is None:
            # NOTE(review): assumes getHead() yields None for an empty result
            # list -- confirm against LinkedList.
            return None

        mostRelevantDocumentId = topHit.docId
        mostRelevantDocumentIdDocumentCollection = int(self._documentIds[mostRelevantDocumentId])

        # Walk the collection until the top-ranked document is reached.
        # NOTE(review): nextDocument() continues from wherever the collection
        # cursor currently is, and this loop has no end-of-collection check --
        # verify the collection's behaviour when the document is not ahead.
        currentDoc = self._docCollect.nextDocument()
        currentDocId = int(currentDoc.getName())
        while currentDocId != mostRelevantDocumentIdDocumentCollection:
            currentDoc = self._docCollect.nextDocument()
            currentDocId = int(currentDoc.getName())

        # currentDoc is now the highest-ranked tf-idf result.
        self._tokenizer.loadDocument(currentDoc)

        # Collect the distinct tokens of that document.  The stored position is
        # the last one seen for each token; it is not used further down.
        uniqueTerms = {}
        token, position = self._tokenizer.nextToken()
        while token is not None:
            uniqueTerms[token] = position
            token, position = self._tokenizer.nextToken()

        # Score each distinct token and keep them ordered by insertSorted.
        orderedExpansionTerms = LinkedList()
        orderedExpansionTerms.printSep = "\n"
        for token in uniqueTerms:
            score = self.tfidf_term(token, mostRelevantDocumentIdDocumentCollection)
            # QueryResultItem's first field (read back as ``docId``) holds the
            # token text here.
            orderedExpansionTerms.insertSorted(QueryResultItem(token, score))

        # Expand the query with the leading terms.  Stop at the end of the
        # list so asking for more terms than exist does not dereference None.
        currentNode = orderedExpansionTerms._head
        added = 0
        while currentNode is not None and added < queryExpansionParameter:
            query += " " + currentNode.value.docId
            currentNode = currentNode.next
            added += 1

        # Score the expanded query.  tfidf() may return None, in which case
        # there are simply no result lines to print.
        results = self.tfidf(query)
        current = results._head if results is not None else None
        rank = 0

        print("\n\nComplete query:")
        print(query)
        print("results:\n")

        while current is not None:
            # Single-argument print call: same output on Python 2 and 3.
            print(str(self._currentQueryId) + " 0 " + str(current.value.docId) + " " + str(rank) + " " + str(current.value.tfidf) + " 0")
            current = current.next
            rank += 1

        self._currentQueryId += 1
Exemplo n.º 3
0
    class Term():
        """Entry for one term: the term, an occurrence count and its documents.

        Instances compare (``<`` and ``==``) by ``term`` only.  Iterating an
        instance walks the nodes of ``self.docs`` from ``_head`` following
        ``next`` links and yields each node's ``value``.  The iteration cursor
        lives on the instance itself, so only one traversal can be in progress
        at a time.
        """

        def __init__(self, term):
            """Create an entry for ``term`` with an empty document list and count 1."""
            self.term = term
            self.docs = LinkedList()
            self.count = 1
            # Node most recently returned by next(); None means the next call
            # starts again from the head of ``self.docs``.
            self._iterator = None
            # Not read by any method of this class; kept so existing code that
            # touches the attribute keeps working.
            self._temp = None

        def __str__(self):
            """Return the term, the count and the rendered document list."""
            return ("Term: " + self.term + ', count:' + str(self.count)
                    + self.docs.toString(','))

        def __lt__(self, other):
            """Order entries by term."""
            return self.term < other.term

        def __eq__(self, other):
            """Two entries are equal when their terms are equal."""
            return self.term == other.term

        def reset(self):
            """Reset the underlying document list and rewind the iteration cursor."""
            self.docs.reset()
            self._iterator = None

        def insertDoc(self, doc, line):
            """Record one occurrence of this term in ``doc`` at ``line``.

            If ``self.docs.getLastItemOf`` returns a truthy existing entry for
            the document, its count is incremented; otherwise a new Doc entry
            is inserted in sorted position.  The line is added in both cases.
            """
            obj = IndexList.Doc(doc)
            current = self.docs.getLastItemOf(obj)
            if current:
                current.count += 1
            else:
                self.docs.insertSorted(obj)
                current = obj
            current.insertLine(line)

        # An earlier, shadowed pair of __iter__/next definitions used to sit
        # here.  It was never reachable (the definitions below replaced it in
        # the class namespace) and it read a ``self.terms`` attribute that this
        # class never sets, so it has been removed.

        def __iter__(self):
            """Return self; the traversal state is kept in ``_iterator``."""
            return self

        def next(self):
            """Return the next document entry, raising StopIteration at the end.

            Once the end is reached the cursor is None again, so a later
            traversal restarts from the head.
            """
            if self._iterator is None:
                if self.docs._head is None:
                    raise StopIteration
                self._iterator = self.docs._head
                return self._iterator.value
            self._iterator = self._iterator.next
            if self._iterator is None:
                raise StopIteration
            return self._iterator.value

        # Python 3 looks up ``__next__`` instead of ``next``; expose both so
        # ``for doc in term`` works on either interpreter.
        __next__ = next
Exemplo n.º 4
0
 def tfidf(self, query):
     """Score documents against ``query`` and return them as a sorted list.

     The query is tokenized with ``self._queryTokenizer`` and each token is
     mapped to a posting list via ``self._getPostingLists``.  For every
     document entry ``doc`` in every posting list the contribution is

         (1 + log10(doc.count / self.termFrequencies[int(doc.doc) - 1]))
             * self._getIDF(term)

     and contributions are summed per document, keyed by ``str(doc.doc)``.
     # NOTE(review): presumably termFrequencies holds a per-document total
     # used to normalise the raw count -- confirm against where it is built.

     Args:
         query: query string.

     Returns:
         A LinkedList (``printSep`` set to a newline) filled through
         ``insertSorted`` with one ``QueryResultItem(docId, score)`` per
         scored document, or None as soon as any posting list is None.
     """
     queryTokens = self._queryTokenizer.tokenize(query)
     postingLists = self._getPostingLists(queryTokens)

     # Accumulate one score per document id.
     documentScores = {}
     for term in postingLists:
         if term is None:
             return None
         for doc in term:
             termFrequency = 1 + math.log10(float(doc.count) / float(self.termFrequencies[int(doc.doc) - 1]))
             docScore = termFrequency * self._getIDF(term)
             docKey = str(doc.doc)
             # Test membership on the dict itself: ``in d.keys()`` builds a
             # list on Python 2 and makes every lookup linear.
             if docKey in documentScores:
                 documentScores[docKey] += docScore
             else:
                 documentScores[docKey] = docScore

     # Build the list of documents ordered by insertSorted.
     orderedQueryResults = LinkedList()
     orderedQueryResults.printSep = "\n"
     for docId, score in documentScores.items():
         orderedQueryResults.insertSorted(QueryResultItem(docId, score))
     return orderedQueryResults
Exemplo n.º 5
0
class IndexList():
    """Sorted list of Term entries, each holding Doc entries, each holding Lines.

    Iterating an instance walks the nodes of ``self.objs`` from ``_head``
    following ``next`` links and yields each node's ``value``.  The iteration
    cursor lives on the instance itself, so only one traversal can be in
    progress at a time.  The nested Term and Doc classes iterate the same way
    over their own lists.
    """

    def __init__(self):
        """Create an empty index."""
        self.objs = LinkedList()
        # Node most recently returned by next(); None means the next call
        # starts again from the head of ``self.objs``.
        self._iterator = None

    def __str__(self):
        """Return the term list rendered with a newline separator."""
        return self.objs.toString('\n')

    def reset(self):
        """Reset the underlying term list and rewind the iteration cursor."""
        self.objs.reset()
        self._iterator = None

    def insert(self, string1, string2, string3):
        """Record one occurrence and return the Term entry that holds it.

        Args:
            string1: the term.
            string2: the document identifier, passed on to ``Term.insertDoc``.
            string3: the line, passed on to ``Term.insertDoc`` (converted to
                int by ``Doc.insertLine``).

        If ``self.objs.getLastItemOf`` returns a truthy existing entry for the
        term, its count is incremented; otherwise a new Term is inserted in
        sorted position.
        """
        obj = IndexList.Term(string1)
        current = self.objs.getLastItemOf(obj)
        if current:
            current.count += 1
        else:
            self.objs.insertSorted(obj)
            current = obj
        current.insertDoc(string2, string3)
        return current

    def __iter__(self):
        """Return self; the traversal state is kept in ``_iterator``."""
        return self

    def next(self):
        """Return the next Term entry, raising StopIteration at the end.

        Once the end is reached the cursor is None again, so a later traversal
        restarts from the head.
        """
        if self._iterator is None:
            if self.objs._head is None:
                raise StopIteration
            self._iterator = self.objs._head
            return self._iterator.value
        self._iterator = self._iterator.next
        if self._iterator is None:
            raise StopIteration
        return self._iterator.value

    # Python 3 looks up ``__next__`` instead of ``next``; expose both so
    # ``for term in index`` works on either interpreter.
    __next__ = next

    class Term():
        """Entry for one term: the term, an occurrence count and its documents.

        Instances compare (``<`` and ``==``) by ``term`` only.
        """

        def __init__(self, term):
            """Create an entry for ``term`` with an empty document list and count 1."""
            self.term = term
            self.docs = LinkedList()
            self.count = 1
            # Iteration cursor over ``self.docs``; None means "start at head".
            self._iterator = None
            # Not read by any method of this class; kept so existing code that
            # touches the attribute keeps working.
            self._temp = None

        def __str__(self):
            """Return the term, the count and the rendered document list."""
            return ("Term: " + self.term + ', count:' + str(self.count)
                    + self.docs.toString(','))

        def __lt__(self, other):
            """Order entries by term."""
            return self.term < other.term

        def __eq__(self, other):
            """Two entries are equal when their terms are equal."""
            return self.term == other.term

        def reset(self):
            """Reset the underlying document list and rewind the iteration cursor."""
            self.docs.reset()
            self._iterator = None

        def insertDoc(self, doc, line):
            """Record one occurrence of this term in ``doc`` at ``line``.

            If ``self.docs.getLastItemOf`` returns a truthy existing entry for
            the document, its count is incremented; otherwise a new Doc entry
            is inserted in sorted position.  The line is added in both cases.
            """
            obj = IndexList.Doc(doc)
            current = self.docs.getLastItemOf(obj)
            if current:
                current.count += 1
            else:
                self.docs.insertSorted(obj)
                current = obj
            current.insertLine(line)

        # An earlier, shadowed pair of __iter__/next definitions used to sit
        # here.  It was never reachable (the definitions below replaced it in
        # the class namespace) and it read a ``self.terms`` attribute that this
        # class never sets, so it has been removed.

        def __iter__(self):
            """Return self; the traversal state is kept in ``_iterator``."""
            return self

        def next(self):
            """Return the next Doc entry, raising StopIteration at the end."""
            if self._iterator is None:
                if self.docs._head is None:
                    raise StopIteration
                self._iterator = self.docs._head
                return self._iterator.value
            self._iterator = self._iterator.next
            if self._iterator is None:
                raise StopIteration
            return self._iterator.value

        # Python 3 iterator hook (see IndexList.__next__).
        __next__ = next

    class Doc():
        """Entry for one document: its id, an occurrence count and its lines.

        Instances compare (``<`` and ``==``) by ``doc`` only.
        """

        def __init__(self, doc):
            """Create an entry for ``doc`` with an empty line list and count 1."""
            self.doc = doc
            self.lines = LinkedList()
            self.count = 1
            # Iteration cursor over ``self.lines``; None means "start at head".
            self._iterator = None
            # Initialised here only; nothing in this class assigns it later.
            self.documentFrequency = None

        def reset(self):
            """Reset the underlying line list and rewind the iteration cursor."""
            self.lines.reset()
            self._iterator = None

        def __str__(self):
            """Return the document id, the count and the rendered line list."""
            return ('\n\t' + str(self.doc) + ', count:' + str(self.count)
                    + self.lines.toString(','))

        def __lt__(self, other):
            """Order entries by document id."""
            return self.doc < other.doc

        def __eq__(self, other):
            """Two entries are equal when their document ids are equal."""
            return self.doc == other.doc

        def insertLine(self, line):
            """Convert ``line`` to int, wrap it in a Line and insert it sorted."""
            line = IndexList.Line(int(line))
            self.lines.insertSorted(line)

        def __iter__(self):
            """Return self; the traversal state is kept in ``_iterator``."""
            return self

        def next(self):
            """Return the next Line entry, raising StopIteration at the end."""
            if self._iterator is None:
                if self.lines._head is None:
                    raise StopIteration
                self._iterator = self.lines._head
                return self._iterator.value
            self._iterator = self._iterator.next
            if self._iterator is None:
                raise StopIteration
            return self._iterator.value

        # Python 3 iterator hook (see IndexList.__next__).
        __next__ = next

    class Line():
        """A single line number; compares (``<`` and ``==``) by ``line``."""

        def __init__(self, line):
            """Store ``line`` as given."""
            self.line = line

        def __str__(self):
            """Return the line number prefixed by a newline and two tabs."""
            return '\n\t\t' + str(self.line)

        def __lt__(self, other):
            """Order lines by number."""
            return self.line < other.line

        def __eq__(self, other):
            """Two lines are equal when their numbers are equal."""
            return self.line == other.line