class Doc():
    """Index entry for one document.

    Holds the document identifier, an occurrence counter that starts at 1,
    and a ``LinkedList`` of ``IndexList.Line`` entries kept in order through
    ``insertSorted``.

    The object is its own iterator over the values stored in ``lines``.
    The cursor lives on the instance, so only one iteration can be in
    progress at a time; it rewinds automatically once the end is reached,
    or explicitly via ``reset()``.
    """

    def __init__(self, doc):
        self.doc = doc
        self.lines = LinkedList()
        self.count = 1
        # Cursor used by next(); None means "start from the head".
        self._iterator = None
        # Not assigned anywhere inside this class.
        self.documentFrequency = None

    def reset(self):
        """Reset the underlying line list and rewind this object's cursor."""
        self.lines.reset()
        self._iterator = None

    def __str__(self):
        return '\n\t' + str(self.doc) + ', count:' + str(self.count) + self.lines.toString(',')

    def __lt__(self, other):
        return self.doc < other.doc

    def __eq__(self, other):
        return self.doc == other.doc

    def insertLine(self, line):
        """Record an occurrence on ``line`` (anything ``int()`` accepts)."""
        line = IndexList.Line(int(line))
        self.lines.insertSorted(line)

    def __iter__(self):
        return self

    def next(self):
        """Return the next stored line value.

        Raises:
            StopIteration: when the list is empty or exhausted. The cursor
                is None afterwards, so the following call starts over from
                the head.
        """
        if self._iterator is None:
            node = self.lines._head
        else:
            node = self._iterator.next
        # Store the cursor before raising so exhaustion rewinds the iterator.
        self._iterator = node
        if node is None:
            raise StopIteration
        return node.value

    # Python 3 spells the iterator protocol method __next__; keep both names.
    __next__ = next
def search(self, query, queryExpansionParameter):
    """Run a tf-idf search with query expansion and print the ranking.

    Steps:
      1. Rank documents for ``query`` with ``self.tfidf``.
      2. Walk ``self._docCollect`` until the top-ranked document is found
         and tokenize it with ``self._tokenizer``.
      3. Score every distinct token of that document with
         ``self.tfidf_term`` and append the first
         ``queryExpansionParameter`` tokens of the resulting sorted list
         to the query.
      4. Rank again with the expanded query and print one line per result
         in the form ``<queryId> 0 <docId> <rank> <score> 0``.

    ``self._currentQueryId`` is incremented once the results are printed.

    Args:
        query: query string.
        queryExpansionParameter: maximum number of expansion terms to add.

    Returns:
        None in every case; results are only printed. Returns early,
        without printing or advancing the query id, when the initial
        ranking is None or has no head element.
    """
    orderedQueryResults = self.tfidf(query)
    if orderedQueryResults is None:
        return None

    topResult = orderedQueryResults.getHead()
    if topResult is None:
        # Empty ranking: there is no document to expand the query from.
        return None
    mostRelevantDocumentId = topResult.docId
    mostRelevantDocumentIdDocumentCollection = int(self._documentIds[mostRelevantDocumentId])

    orderedExpansionTerms = LinkedList()
    orderedExpansionTerms.printSep = "\n"
    uniqueTerms = {}

    # Advance through the collection until the top-ranked document is reached.
    # NOTE(review): assumes nextDocument() starts from the beginning of the
    # collection and always reaches that document - confirm against the
    # document-collection class.
    currentDoc = self._docCollect.nextDocument()
    currentDocId = int(currentDoc.getName())
    while currentDocId != mostRelevantDocumentIdDocumentCollection:
        currentDoc = self._docCollect.nextDocument()
        currentDocId = int(currentDoc.getName())

    # currentDoc is now the highest ranked tf-idf result.
    self._tokenizer.loadDocument(currentDoc)

    # Collect the distinct tokens of that document (value = last position seen).
    token, position = self._tokenizer.nextToken()
    while token is not None:
        uniqueTerms[token] = position
        token, position = self._tokenizer.nextToken()

    # Score each token and keep the candidates in a sorted linked list.
    for token in uniqueTerms:
        score = self.tfidf_term(token, mostRelevantDocumentIdDocumentCollection)
        orderedExpansionTerms.insertSorted(QueryResultItem(token, score))

    # Expand the query with the leading candidates; stop early when the
    # document has fewer distinct tokens than requested.
    currentNode = orderedExpansionTerms._head
    for _ in range(queryExpansionParameter):
        if currentNode is None:
            break
        # QueryResultItem keeps its first constructor argument (the token) in docId.
        query += " " + currentNode.value.docId
        currentNode = currentNode.next

    # Perform the search with the expanded query.
    results = self.tfidf(query)
    # tfidf() yields None when a query term has no posting list; print no rows then.
    current = results._head if results is not None else None
    rank = 0
    print("\n\nComplete query:")
    print(query)
    print("results:\n")
    while current is not None:
        print(str(self._currentQueryId) + " 0 " + str(current.value.docId) + " " + str(rank) + " " + str(current.value.tfidf) + " 0")
        current = current.next
        rank += 1
    self._currentQueryId += 1
class Term():
    """Index entry for one term.

    Holds the term text, an occurrence counter that starts at 1, and a
    ``LinkedList`` of ``IndexList.Doc`` entries kept in order through
    ``insertSorted``.

    The object is its own iterator over the values stored in ``docs``.
    The cursor lives on the instance, so only one iteration can be in
    progress at a time; it rewinds automatically once the end is reached,
    or explicitly via ``reset()``.
    """

    def __init__(self, term):
        self.term = term
        self.docs = LinkedList()
        self.count = 1
        # Cursor used by next(); None means "start from the head".
        self._iterator = None
        # Not read by any method of this class; kept so existing attribute
        # access from outside keeps working.
        self._temp = None

    def __str__(self):
        return "Term: " + self.term + ', count:' + str(self.count) + self.docs.toString(',')

    def __lt__(self, other):
        return self.term < other.term

    def __eq__(self, other):
        return self.term == other.term

    def reset(self):
        """Reset the underlying document list and rewind this object's cursor."""
        self.docs.reset()
        self._iterator = None

    def insertDoc(self, doc, line):
        """Record an occurrence of this term in ``doc`` at ``line``.

        When ``docs`` already has an entry equal to ``doc`` its counter is
        incremented; otherwise a new entry is inserted. The line is then
        added to that entry.
        """
        obj = IndexList.Doc(doc)
        current = self.docs.getLastItemOf(obj)
        if current:
            current.count += 1
        else:
            self.docs.insertSorted(obj)
            current = obj
        current.insertLine(line)

    def __iter__(self):
        return self

    def next(self):
        """Return the next stored document entry.

        Raises:
            StopIteration: when the list is empty or exhausted. The cursor
                is None afterwards, so the following call starts over from
                the head.
        """
        if self._iterator is None:
            node = self.docs._head
        else:
            node = self._iterator.next
        # Store the cursor before raising so exhaustion rewinds the iterator.
        self._iterator = node
        if node is None:
            raise StopIteration
        return node.value

    # Python 3 spells the iterator protocol method __next__; keep both names.
    __next__ = next
def tfidf(self, query):
    """Score documents against ``query`` and return them as a sorted list.

    The query is tokenized with ``self._queryTokenizer`` and the posting
    lists are fetched with ``self._getPostingLists``. Each document found
    in a posting list contributes

        (1 + log10(doc.count / self.termFrequencies[int(doc.doc) - 1]))
            * self._getIDF(term)

    to its score; contributions from several query terms are summed.

    Args:
        query: query string.

    Returns:
        A ``LinkedList`` (``printSep`` set to a newline) of
        ``QueryResultItem(docId, score)`` entries inserted with
        ``insertSorted``, with ``docId`` being ``str(doc.doc)``; or None as
        soon as any posting list is None.
    """
    queryTokens = self._queryTokenizer.tokenize(query)
    postingLists = self._getPostingLists(queryTokens)

    # Accumulate a score per document id (stringified).
    documentScores = {}
    for term in postingLists:
        if term is None:
            # A missing posting list aborts the whole query.
            return None
        # The IDF depends only on the term, so look it up once per term.
        idf = self._getIDF(term)
        for doc in term:
            docKey = str(doc.doc)
            termFrequency = 1 + math.log10(
                float(doc.count) / float(self.termFrequencies[int(doc.doc) - 1]))
            documentScores[docKey] = documentScores.get(docKey, 0) + termFrequency * idf

    # Build the list of results; insertSorted decides the ordering.
    orderedQueryResults = LinkedList()
    orderedQueryResults.printSep = "\n"
    for docId, docScore in documentScores.items():
        orderedQueryResults.insertSorted(QueryResultItem(docId, docScore))
    return orderedQueryResults
class IndexList():
    """Inverted index: a sorted ``LinkedList`` of ``Term`` entries.

    Each ``Term`` holds a list of ``Doc`` entries and each ``Doc`` a list
    of ``Line`` entries. ``IndexList``, ``Term`` and ``Doc`` are each their
    own iterator over the values of their list. The cursor lives on the
    object, so only one iteration per object can be in progress at a time;
    it rewinds automatically once the end is reached, or explicitly via
    ``reset()``.
    """

    def __init__(self):
        self.objs = LinkedList()
        # Cursor used by next(); None means "start from the head".
        self._iterator = None

    def __str__(self):
        return self.objs.toString('\n')

    def reset(self):
        """Reset the underlying term list and rewind this object's cursor."""
        self.objs.reset()
        self._iterator = None

    def insert(self, string1, string2, string3):
        """Record one occurrence of a term.

        Args:
            string1: the term.
            string2: the document identifier, passed to ``Term.insertDoc``.
            string3: the line, passed on to ``Doc.insertLine`` (converted
                there with ``int()``).

        Returns:
            The ``Term`` entry that now holds the occurrence: the existing
            one with its counter incremented, or the newly inserted one.
        """
        obj = IndexList.Term(string1)
        current = self.objs.getLastItemOf(obj)
        if current:
            current.count += 1
        else:
            self.objs.insertSorted(obj)
            current = obj
        current.insertDoc(string2, string3)
        return current

    def __iter__(self):
        return self

    def next(self):
        """Return the next ``Term``; raise StopIteration at the end and rewind."""
        if self._iterator is None:
            node = self.objs._head
        else:
            node = self._iterator.next
        # Store the cursor before raising so exhaustion rewinds the iterator.
        self._iterator = node
        if node is None:
            raise StopIteration
        return node.value

    # Python 3 spells the iterator protocol method __next__; keep both names.
    __next__ = next

    class Term():
        """Index entry for one term: text, counter and sorted ``Doc`` list."""

        def __init__(self, term):
            self.term = term
            self.docs = LinkedList()
            self.count = 1
            self._iterator = None
            # Not read by any method of this class; kept so existing
            # attribute access from outside keeps working.
            self._temp = None

        def __str__(self):
            return "Term: " + self.term + ', count:' + str(self.count) + self.docs.toString(',')

        def __lt__(self, other):
            return self.term < other.term

        def __eq__(self, other):
            return self.term == other.term

        def reset(self):
            """Reset the underlying document list and rewind the cursor."""
            self.docs.reset()
            self._iterator = None

        def insertDoc(self, doc, line):
            """Record an occurrence of this term in ``doc`` at ``line``.

            An existing entry equal to ``doc`` gets its counter incremented;
            otherwise a new entry is inserted. The line is then added to
            that entry.
            """
            obj = IndexList.Doc(doc)
            current = self.docs.getLastItemOf(obj)
            if current:
                current.count += 1
            else:
                self.docs.insertSorted(obj)
                current = obj
            current.insertLine(line)

        def __iter__(self):
            return self

        def next(self):
            """Return the next ``Doc``; raise StopIteration at the end and rewind."""
            if self._iterator is None:
                node = self.docs._head
            else:
                node = self._iterator.next
            self._iterator = node
            if node is None:
                raise StopIteration
            return node.value

        __next__ = next

    class Doc():
        """Index entry for one document: id, counter and sorted ``Line`` list."""

        def __init__(self, doc):
            self.doc = doc
            self.lines = LinkedList()
            self.count = 1
            self._iterator = None
            # Not assigned anywhere inside this class.
            self.documentFrequency = None

        def reset(self):
            """Reset the underlying line list and rewind the cursor."""
            self.lines.reset()
            self._iterator = None

        def __str__(self):
            return '\n\t' + str(self.doc) + ', count:' + str(self.count) + self.lines.toString(',')

        def __lt__(self, other):
            return self.doc < other.doc

        def __eq__(self, other):
            return self.doc == other.doc

        def insertLine(self, line):
            """Record an occurrence on ``line`` (anything ``int()`` accepts)."""
            line = IndexList.Line(int(line))
            self.lines.insertSorted(line)

        def __iter__(self):
            return self

        def next(self):
            """Return the next ``Line``; raise StopIteration at the end and rewind."""
            if self._iterator is None:
                node = self.lines._head
            else:
                node = self._iterator.next
            self._iterator = node
            if node is None:
                raise StopIteration
            return node.value

        __next__ = next

    class Line():
        """A line number, ordered and compared by its numeric value."""

        def __init__(self, line):
            self.line = line

        def __str__(self):
            return '\n\t\t' + str(self.line)

        def __lt__(self, other):
            return self.line < other.line

        def __eq__(self, other):
            return self.line == other.line