def __init__(self, text, collection, stemmer=None, stopwordVocab=None):
    """Set up the query state and index the words of ``text``.

    Args:
        text: Raw query string; kept on ``self.text`` and handed to the
            parser's ``getWords``.
        collection: Object stored on ``self.collection`` for later use.
        stemmer: Forwarded unchanged to ``UtteranceTextParser``.
        stopwordVocab: Forwarded unchanged to ``UtteranceTextParser``.
    """
    # Plain state first; norm and weights stay None until they are computed.
    self.text = text
    self.collection = collection
    self.results = {}
    self.vocab = Vocabulary()
    self.maxFreq = 0
    self.norm = None
    self.weights = None

    # Parse
    self.parser = UtteranceTextParser(stemmer, stopwordVocab)
    for token in self.parser.getWords(text):
        self.vocab.add(token)

    # Remember the largest count seen for any single word.
    for term in self.getWordList():
        count = self.getWordCount(term)
        if count > self.maxFreq:
            self.maxFreq = count
class Query(object):
    """A text query whose terms are weighted against a document collection.

    The query text is split into words by ``UtteranceTextParser`` and the
    words are counted in a ``Vocabulary``. Term weights (tf * idf), the
    Euclidean norm of those weights and the ranked results are computed on
    first request and cached until the vocabulary changes.
    """

    def __init__(self, text, collection, stemmer=None, stopwordVocab=None):
        """Parse ``text`` and record the word counts.

        Args:
            text: Raw query string.
            collection: Object providing ``inverseDocumentFrequency(term)``
                and ``getDocuments()``.
            stemmer: Forwarded unchanged to ``UtteranceTextParser``.
            stopwordVocab: Forwarded unchanged to ``UtteranceTextParser``.
        """
        self.text = text
        self.collection = collection
        # A list, matching what findResults() stores and getResults() slices.
        self.results = []
        self.vocab = Vocabulary()
        self.maxFreq = 0
        # None means "not computed yet"; see getNorm() / getWeights().
        self.norm = None
        self.weights = None

        # Parse
        self.parser = UtteranceTextParser(stemmer, stopwordVocab)
        for word in self.parser.getWords(text):
            self.vocab.add(word)

        # Track the highest count of any single word.
        for word in self.getWordList():
            count = self.getWordCount(word)
            if count > self.maxFreq:
                self.maxFreq = count

    def __iter__(self):
        """Iterate over the underlying vocabulary."""
        return iter(self.vocab)

    def addWord(self, word):
        """Add ``word`` to the vocabulary and drop every derived cache.

        Weights, norm and results all depend on the word counts, so they
        are reset here and recomputed on the next request.
        """
        self.vocab.add(word)
        count = self.getWordCount(word)
        if count > self.maxFreq:
            self.maxFreq = count
        self.weights = None
        self.norm = None
        self.results = []

    def getWordCount(self, word):
        """Return the vocabulary's count for ``word``."""
        return self.vocab.getWordCount(word)

    def getWordList(self):
        """Return the vocabulary's word list."""
        return self.vocab.getWordList()

    # Term Weights
    def calculateNorm(self):
        """Store the Euclidean norm of the term weights on ``self.norm``."""
        weights = self.getWeights()
        sumSquares = sum((w * w for w in weights.values()), 0.0)
        self.norm = math.sqrt(sumSquares)

    def getNorm(self):
        """Return the weight-vector norm, computing it on first use."""
        # Compare with None: a norm of 0.0 is a valid cached value.
        if self.norm is None:
            self.calculateNorm()
        return self.norm

    def calculateWeights(self):
        """Store ``count * inverseDocumentFrequency`` for every word."""
        self.weights = {}
        for term in self.getWordList():
            tf = self.getWordCount(term)
            idf = self.collection.inverseDocumentFrequency(term)
            self.weights[term] = tf * idf

    def getWeights(self):
        """Return the term -> weight dict, computing it on first use."""
        # Compare with None: an empty dict is a valid cached value.
        if self.weights is None:
            self.calculateWeights()
        return self.weights

    def getTermWeight(self, term):
        """Return the weight of ``term``, or 0 when it is not in the query."""
        return self.getWeights().get(term, 0)

    def printStatistics(self, string):
        """Print count, idf and weight for each word parsed from ``string``."""
        for w in self.parser.getWords(string):
            print("Query Stats")
            print("Word: %s" % w)
            print("TF  : %d" % self.getWordCount(w))
            print("IDF : %.3f" % self.collection.inverseDocumentFrequency(w))
            print("WGHT: %.3f" % self.getTermWeight(w))

    # Results
    def findResults(self):
        """Build one ``QueryResult`` per document, sorted ascending.

        The order is whatever ``QueryResult``'s comparison defines.
        NOTE(review): presumably that puts the best match first - confirm.
        """
        # sorted() is stable, so ties keep document order exactly as
        # repeated bisect.insort() calls would, without per-insert shifting.
        self.results = sorted(
            QueryResult(doc, self) for doc in self.collection.getDocuments()
        )

    def getResults(self, topK=None):
        """Return the sorted results, computing them when none are cached.

        Args:
            topK: Maximum number of results to return. Any falsy value
                (``None`` or ``0``) returns the complete list.
        """
        if not self.results:
            self.findResults()
        if topK:
            return self.results[:topK]
        return self.results