class Pepper(object):
    """The Pepper Potts of UI (Public Relations) for Tony Stark.

    Handles user-input queries: parses them and returns ranked results
    from the corpus indexed by Ironman.
    """

    def __init__(self, documents, NDC, stop_words):
        """Store the indexed corpus and scoring helpers.

        @documents: iterable of Document objects produced by the indexer
        @NDC: scorer exposing normalize(query) and score(query, document)
        @stop_words: stop-word list forwarded to the stemmer (or None)
        """
        super(Pepper, self).__init__()
        self.documents = documents
        self.NDC = NDC
        self.p = PorterStemmer()
        self.stop_words = stop_words

    def handleQuery(self, user_input):
        """Stem, normalize, and score a user query against every document.

        @user_input: the raw query string typed by the user
        Returns a list of (score, document) tuples, best match first.
        """
        stem_query = self.p.stemText(user_input, self.stop_words).encode('utf_8', 'ignore')
        query = Document(stem_query, full_text=user_input)
        self.NDC.normalize(query)
        scores = [(self.NDC.score(query, document), document)
                  for document in self.documents]
        # Sort on the numeric score only: plain tuple sorting would fall
        # through to comparing Document objects on score ties, which
        # raises TypeError on Python 3.
        scores.sort(key=lambda pair: pair[0], reverse=True)
        return scores

    def score(query, document):
        # NOTE(review): stub is missing `self` and always returns 1;
        # looks like leftover dead code — kept for backward compatibility.
        return 1
class Parser(object):
    """The parsing workhorse of the entire project."""

    def __init__(self, stop_words, **kwargs):
        """Constructor for the Parser object.

        @stop_words: a list of stop words, or None to disable filtering
        """
        super(Parser, self).__init__()
        # Lowercase the stop words once up front so later membership
        # checks are case-insensitive.
        if stop_words is not None:
            self.stop_words = [word.lower() for word in stop_words]
        else:
            self.stop_words = None
        self.hashes = []          # MD5 hex digests of pages seen so far
        self.documents = []       # unique Document objects, in crawl order
        self.num_duplicates = 0   # count of exact-duplicate pages skipped
        self.p = PorterStemmer()

    def retrieveText(self, page_soup, url):
        """Extract, stem, and de-duplicate the text of a crawled page.

        @page_soup: the soupified version of a webpage that has already
                    been crawled
        @url: the page's URL, recorded on the resulting Document
        """
        # All the text of the page minus the html tags.
        page_text = page_soup.get_text()
        # Stemmed, stop-word-free text, encoded so it can be hashed.
        stem_text = self.p.stemText(page_text, self.stop_words).encode('utf_8', 'ignore')
        # The MD5 hex digest doubles as the document ID: digests are
        # unique unless the stemmed page is a 100% duplicate.
        page_hash = hashlib.md5(stem_text).hexdigest()
        if page_hash in self.hashes:
            self.num_duplicates += 1
        else:
            # First sighting: remember the hash and keep the Document.
            self.hashes.append(page_hash)
            self.documents.append(Document(stem_text, page_text, url, page_hash))