# Stdlib imports needed by the code below (they may already appear at the
# top of this module, alongside the project imports for buildArguments,
# PorterStemmer, Vocabulary, UtteranceTextParser, UtteranceCollection,
# QueryResult, and the constants DEF_TOP, WIDTH, and INDENT).
import bisect
import errno
import json
import math
import pickle
import textwrap


def main():
    # ARGUMENT PARSING
    argParser = buildArguments()
    args = argParser.parse_args()
    print("----------")

    # STEMMER CREATION
    stemmer = None
    if args.stem:
        stemmer = PorterStemmer()

    # STOPWORD CREATION
    stopwordVocab = None
    if args.stopword_filename:
        if not args.stopword_filename.endswith('.txt'):
            print('Wrong stopword file format: please provide a .txt file')
            return errno.EINVAL
        try:
            with open(args.stopword_filename) as raw_data_file:
                print('Building stopword vocabulary: %s' % args.stopword_filename)
                stopwordList = raw_data_file.readlines()
        except OSError as e:
            if e.errno == errno.ENOENT:
                print('Could not find stopword file %s' % args.stopword_filename)
                return errno.ENOENT
            raise
        stopwordVocab = Vocabulary()
        for w in stopwordList:
            stopwordVocab.add(w.strip())
        print('Done!')
        print("----------")

    # COLLECTION CREATION
    collection = None
    if args.filename.endswith('.json'):
        try:
            with open(args.filename) as raw_data_file:
                print('Processing JSON Utterances file: %s' % args.filename)
                try:
                    jsonData = json.load(raw_data_file)
                except ValueError:
                    print('Failed to parse JSON file %s' % args.filename)
                    return errno.EINVAL
        except OSError as e:
            if e.errno == errno.ENOENT:
                print('Could not find file %s' % args.filename)
                return errno.ENOENT
            raise
        collection = UtteranceCollection(jsonData, stemmer=stemmer,
                                         stopwordVocab=stopwordVocab,
                                         dedup=args.dedup,
                                         metadata=args.metadata)
        print('Done!')
        print("----------")

        # PICKLE THE COLLECTION
        if args.pickle:
            pickleFilename = args.filename.replace(".", "_") + ".pickle"
            try:
                print("Saving to file %s" % pickleFilename)
                with open(pickleFilename, "wb") as outFile:
                    pickle.dump(collection, outFile)
                print("Save Successful!")
                print("----------")
            except OSError as e:
                if e.errno == errno.ENOENT:
                    print('Could not open file %s' % pickleFilename)
                    return errno.ENOENT
                raise
    elif args.filename.endswith('.pickle'):
        # OPEN FROM PICKLED FILE
        print("Opening processed file %s" % args.filename)
        try:
            # Pickle files must be opened in binary mode.
            with open(args.filename, "rb") as pickleFile:
                collection = pickle.load(pickleFile)
            print("Load Successful!")
            print("----------")
        except (OSError, pickle.UnpicklingError) as e:
            print("Could not load pickle file %s: %s" % (args.filename, e))
            return errno.EINVAL
    else:
        print('File issue: provide a .json utterances file or a .pickle results file')
        return errno.EINVAL

    # QUERYING
    if args.query_filename and args.query_filename.endswith('.txt'):
        # TOP RESULTS
        topResultNum = int(args.top) if args.top else DEF_TOP

        # OPEN QUERY FILE
        print("Opening query file: %s" % args.query_filename)
        try:
            with open(args.query_filename) as qfile:
                queryText = qfile.read().replace("\n", " ").strip()
        except OSError as e:
            if e.errno == errno.ENOENT:
                print('Could not find file %s' % args.query_filename)
                return errno.ENOENT
            raise

        query = Query(queryText, collection, stemmer=stemmer,
                      stopwordVocab=stopwordVocab)
        print("Finding documents related to:")
        print('~' * WIDTH)
        dedented_text = textwrap.dedent(queryText).strip()
        print(textwrap.fill(dedented_text, initial_indent=' ',
                            subsequent_indent=' ', width=100))
        query.findResults()
        print('~' * WIDTH)

        print("Finding top %s results" % topResultNum)
        print('~' * WIDTH)
        for i, res in enumerate(query.getResults(topResultNum), start=1):
            doc = res.getDocument()
            resultStr = "%-2d|%.3f\tpid:%d\tperson:%s" % \
                (i, res.getSimilarity(), doc.pid, doc.PersonType)
            print(resultStr)
            print('~' * WIDTH)
            dedented_text = textwrap.dedent(doc.text).strip()
            print(textwrap.fill(dedented_text, initial_indent=' ' * INDENT,
                                subsequent_indent=' ' * INDENT, width=WIDTH))
            print('~' * WIDTH)

        if args.word:
            print("----------")
            collection.printStatistics(args.word)
            print("----------")
            query.printStatistics(args.word)
            print("----------")
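
# A minimal, self-contained sketch of the cosine similarity used to rank
# documents. This is an assumption about what QueryResult (defined elsewhere
# in this project) computes from Query.getNorm() and Query.getWeights();
# `queryWeights` and `docWeights` are hypothetical plain dicts standing in
# for the query's and a document's tf-idf weight vectors. Uses the
# module-level `math` import above.
def cosineSimilarity(queryWeights, docWeights):
    # Dot product over the query's terms; terms absent from the document weigh 0.
    dot = sum(w * docWeights.get(t, 0.0) for t, w in queryWeights.items())
    queryNorm = math.sqrt(sum(w * w for w in queryWeights.values()))
    docNorm = math.sqrt(sum(w * w for w in docWeights.values()))
    if queryNorm == 0.0 or docNorm == 0.0:
        return 0.0  # an empty vector matches nothing; avoid division by zero
    return dot / (queryNorm * docNorm)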
class Query(object):
    def __init__(self, text, collection, stemmer=None, stopwordVocab=None):
        self.text = text
        self.collection = collection
        self.results = []
        self.vocab = Vocabulary()
        self.maxFreq = 0
        self.norm = None
        self.weights = None
        # Parse the query text into the vocabulary, then record the
        # highest term frequency seen.
        self.parser = UtteranceTextParser(stemmer, stopwordVocab)
        for word in self.parser.getWords(text):
            self.vocab.add(word)
        for word in self.getWordList():
            if self.getWordCount(word) > self.maxFreq:
                self.maxFreq = self.getWordCount(word)

    def __iter__(self):
        return self.vocab.__iter__()

    def addWord(self, word):
        self.vocab.add(word)

    def getWordCount(self, word):
        return self.vocab.getWordCount(word)

    def getWordList(self):
        return self.vocab.getWordList()

    # Term Weights
    def calculateNorm(self):
        # Euclidean (L2) norm of the query's tf-idf weight vector.
        sumSquares = 0.0
        weights = self.getWeights()
        for w in weights:
            sumSquares += weights[w] * weights[w]
        self.norm = math.sqrt(sumSquares)

    def getNorm(self):
        if self.norm is None:
            self.calculateNorm()
        return self.norm

    def calculateWeights(self):
        # Weight each query term by tf * idf against the collection.
        self.weights = {}
        for term in self.getWordList():
            tf = self.getWordCount(term)
            idf = self.collection.inverseDocumentFrequency(term)
            self.weights[term] = tf * idf

    def getWeights(self):
        if self.weights is None:
            self.calculateWeights()
        return self.weights

    def getTermWeight(self, term):
        weights = self.getWeights()
        if term not in weights:
            return 0
        return weights[term]

    def printStatistics(self, string):
        print("Query Stats")
        for w in self.parser.getWords(string):
            print("Word: %s" % w)
            print("TF  : %d" % self.getWordCount(w))
            print("IDF : %.3f" % self.collection.inverseDocumentFrequency(w))
            print("WGHT: %.3f" % self.getTermWeight(w))

    # Results
    def findResults(self):
        # Score every document in the collection, keeping the result
        # list sorted as it grows.
        self.results = []
        for doc in self.collection.getDocuments():
            result = QueryResult(doc, self)
            bisect.insort(self.results, result)

    def getResults(self, topK=None):
        if not self.results:
            self.findResults()
        if topK:
            return self.results[:topK]
        return self.results
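
# Conventional entry-point guard (an assumption: one is not shown in this
# excerpt; omit it if the module already defines one). main() returns
# errno-style codes on failure and None on success, both of which
# sys.exit() forwards to the shell as expected.
if __name__ == "__main__":
    import sys
    sys.exit(main())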