def on_receive(self, message):
    """Dispatch a message sent to the DocumentParser.

    Recognised ``message['method']`` values and the ``message['data']``
    entry each one reads:

    * ``'load_file'`` -- ``data['file']`` is passed to ``self._load_file``.
    * ``'stem_word'`` -- ``data['word']`` is passed to ``self._stem_word``;
      the result is sent back as ``data={'stem': stem}``.
    * ``'get_doc'``   -- ``data['doc_id']`` is passed to
      ``self._get_doc_info``; the result is sent back as ``data``.

    Returns whatever ``msg.build_response`` builds: status 0 on success,
    -1 when the required argument is missing or falsy, -13 when the method
    is not recognised.
    """
    log.log_info("DocumentParser received message: {:}".format(message))

    # .get() keeps a malformed message (missing 'method', 'data' or the
    # argument key) on the error-response path instead of raising KeyError.
    method = message.get('method')
    data = message.get('data') or {}

    if method == 'load_file':
        path = data.get('file')
        if not path:
            return msg.build_response(
                status=-1,
                error_msg="DocumentParser.load_file no file provided")
        # The return value of _load_file is not inspected, so status 0 only
        # says the load was attempted.
        self._load_file(path)
        return msg.build_response(status=0)

    if method == 'stem_word':
        word = data.get('word')
        if not word:
            return msg.build_response(
                status=-1,
                error_msg="DocumentParser.stem_word no word provided")
        stem = self._stem_word(word)
        return msg.build_response(status=0, data={'stem': stem})

    if method == 'get_doc':
        doc_id = data.get('doc_id')
        if not doc_id:
            return msg.build_response(
                status=-1,
                error_msg="DocumentParser.get_doc no doc id provided")
        info = self._get_doc_info(doc_id)
        return msg.build_response(status=0, data=info)

    return msg.build_response(
        status=-13,
        error_msg="No method to process message: {:}".format(message))
def on_receive(self, message):
    """Route a client message to the actor that handles it.

    * ``'load_file'`` -- forwards the message unchanged to
      ``self.document_parser`` and reports status 0, or -2 with the
      parser's error message when the parser answers with a non-zero
      status.
    * ``'search'`` -- sends a ``'query'`` request carrying the same data to
      ``self.query_processor`` and returns its response when its status is
      0, or a -2 response with the processor's error message otherwise.

    A missing or falsy argument yields status -1; an unrecognised method
    yields status -13.
    """
    log.log_info("Orchestrator received message: {:}".format(message))

    # .get() keeps malformed messages on the error-response path instead of
    # raising KeyError.
    method = message.get('method')
    data = message.get('data') or {}

    if method == 'load_file':
        if not data.get('file'):
            return msg.build_response(
                status=-1,
                error_msg="No file provided to Orchestrator.load_file")
        response = self.document_parser.ask(message)
        if response['status'] == 0:
            return msg.build_response(status=0)
        return msg.build_response(
            status=-2,
            error_msg="Orchestrator.load_file failed: {:}".format(
                response['error_msg']))

    if method == 'search':
        if not data.get('query'):
            return msg.build_response(
                status=-1,
                error_msg="No word provided to Orchestrator.search")
        response = self.query_processor.ask(
            msg.build_request(method='query', data=data))
        if response['status'] == 0:
            return response
        # A failed search used to fall through to the generic -13
        # "No method to process message" reply, hiding the real error.
        return msg.build_response(
            status=-2,
            error_msg="Orchestrator.search failed: {:}".format(
                response['error_msg']))

    return msg.build_response(
        status=-13,
        error_msg="No method to process message: {:}".format(message))
def _load_by_page(self, page):
    """Merge one page's entries into ``self.index``.

    ``page`` is a two-level mapping; every ``page[w][i]`` value is copied to
    ``self.index[w][i]``, creating ``self.index[w]`` when it does not exist
    yet and overwriting an existing ``[w][i]`` entry.
    (Presumably ``w`` is a word, ``i`` a document id and the value a count
    -- confirm against the code that builds the page.)
    """
    log.log_info("IndexHandler is adding a page...")
    for word, postings in page.items():
        # setdefault does a direct dict lookup; the previous
        # ``word not in self.index.keys()`` built and scanned a key list
        # for every word on Python 2.
        self.index.setdefault(word, {}).update(postings)
    log.log_info("Done adding page")
def on_stop(self):
    """Log that the DocumentParser is stopping; nothing else is done.

    Persisting ``self.stop_words`` to ``stop-words.p`` on shutdown is
    disabled (the pickle dump that did it was commented out).
    """
    for text in ("Stopping DocumentParser...", "DocumentParser stopped"):
        log.log_info(text)
def on_receive(self, message):
    """Handle a message sent to the QueryProcessor.

    Only ``'query'`` is recognised: ``message['data']['query']`` is passed
    to ``self._process_query`` and its return value is returned unchanged.
    A missing or falsy query yields a status -1 response; any other method
    yields status -13.
    """
    log.log_info("QueryProcessor received message: {:}".format(message))

    if message.get('method') == 'query':
        # .get() keeps a message without 'data'/'query' on the
        # error-response path instead of raising KeyError.
        query = (message.get('data') or {}).get('query')
        if query:
            return self._process_query(query)
        return msg.build_response(
            status=-1,
            error_msg="QueryProcessor.query no query provided")

    return msg.build_response(
        status=-13,
        error_msg="No method to process message: {:}".format(message))
def _beautify_result(self, data, word):
    """Render the matching documents as one printable string.

    ``data`` maps a document id to the value used for ranking (term
    frequency); entries are listed from highest to lowest value and
    numbered from 1. For each document the value returned by
    ``self._get_doc_info`` is unpacked into title, text, author and date,
    and the text is passed through ``self._beautify_text`` together with
    ``word`` before being formatted.
    """
    ranked = sorted(data.items(), key=lambda pair: pair[1], reverse=True)
    log.log_info("Docs for query: {:}".format(ranked))

    # One numbered entry per document; the title is wrapped in ANSI colour
    # escape codes.
    entry_format = "[{:}]\t\033[94m{:}\033[0m - Doc: {:}\n\t{:} - {:}\n\t{:}\n"
    entries = []
    for position, (doc_id, _) in enumerate(ranked, 1):
        title, text, author, date = self._get_doc_info(doc_id)
        snippet = self._beautify_text(text, word)
        entries.append(entry_format.format(
            position, title, doc_id, author, date, snippet))
    return "".join(entries)
def on_start(self):
    """Load the stop-word list when the DocumentParser starts.

    First tries to unpickle ``stop-words.p`` into ``self.stop_words``. If
    that fails for any reason, falls back to reading ``stop-words.txt`` and
    appending every line, stripped of surrounding whitespace, to the
    existing ``self.stop_words``.
    """
    log.log_info("DocumentParser.on_start loading stop words")
    try:
        log.log_info("Reading from persistance file")
        # Pickle data is binary, so the file is opened in "rb" mode.
        # NOTE(security): pickle.load can execute arbitrary code; this is
        # only safe while stop-words.p comes from a trusted source.
        with open("stop-words.p", "rb") as f:
            self.stop_words = pickle.load(f)
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (rather than a
        # bare ``except``) lets SystemExit and KeyboardInterrupt propagate.
        log.log_info("Could not read from persistance, rebuilding")
        with open("stop-words.txt", 'r') as f:
            self.stop_words.extend(line.strip() for line in f)
    log.log_info("Stop words loaded")
def _get_doc_info(self, doc_id):
    """Look up a page by id in the already-loaded XML files.

    Every file in ``self.parsed_docs`` is parsed again on each call and its
    ``<page>`` elements are scanned until one whose ``<id>`` equals
    ``doc_id`` is found. Page ids are converted with ``int()`` before the
    comparison, so ``doc_id`` has to compare equal to an int.

    Returns a dict with the keys ``'title'``, ``'text'``, ``'author'`` and
    ``'date'``. The text is reduced to ASCII and every character found in
    ``self.unwanted_chars`` is replaced by a space. The author is the
    contributor's username, else the contributor's ip, else ``"Author"``.
    When no page matches, an error is logged and a dict of placeholder
    strings is returned.
    """
    for path in self.parsed_docs:
        log.log_info("Getting info from doc: {:}".format(path))
        root = ET.parse(path).getroot()
        for page in root.findall('page'):
            if int(page.find('id').text) != doc_id:
                continue
            log.log_info("Found target page")

            title = page.find('title').text

            # A missing or empty <text> element has no text content; treat
            # it as an empty string instead of crashing on None.encode.
            text_node = page.find('revision/text')
            body = text_node.text if text_node is not None else None
            text = (body or u"").encode('ascii', 'ignore')

            # Single pass over the text; the previous loop re-ran
            # str.replace over the whole text for every unwanted occurrence.
            unwanted = self.unwanted_chars
            text = "".join(' ' if c in unwanted else c for c in text)

            # Prefer the username, then the ip, then a placeholder.
            contributor = page.find('revision/contributor/username')
            if contributor is None:
                contributor = page.find('revision/contributor/ip')
            author = contributor.text if contributor is not None else "Author"

            date = page.find('revision/timestamp').text
            return {
                'title': title,
                'text': text,
                'author': author,
                'date': date
            }

    log.log_error(
        "Cound not find document in corpus: {:}".format(doc_id))
    return {
        'title': "Title",
        'text': "Text",
        'author': "Author",
        'date': "Date"
    }
def on_receive(self, message):
    """Handle a message sent to the IndexHandler.

    * ``'store_page'`` -- ``data['page']`` is passed to
      ``self._load_by_page``; answers with status 0.
    * ``'search'``     -- ``data['word']`` is passed to ``self._search``;
      the result is sent back as ``data`` with status 0.

    A missing or falsy argument yields status -1; an unrecognised method
    yields status -13.
    """
    log.log_info("IndexHandler received message: {:}".format(message))

    # .get() keeps malformed messages on the error-response path instead of
    # raising KeyError.
    method = message.get('method')
    data = message.get('data') or {}

    if method == 'store_page':
        page = data.get('page')
        if not page:
            return msg.build_response(
                status=-1,
                error_msg="IndexHandler.store_page no page provided")
        self._load_by_page(page)
        return msg.build_response(status=0)

    if method == 'search':
        word = data.get('word')
        if not word:
            return msg.build_response(
                status=-1,
                error_msg="IndexHandler.search no word provided")
        result = self._search(word)
        return msg.build_response(status=0, data=result)

    return msg.build_response(
        status=-13,
        error_msg="No method to process message: {:}".format(message))
def _load_file(self, _file):
    """Parse an XML file and send each of its pages to the index handler.

    Every ``<page>`` element under the root is converted with
    ``self._parse_xml_page`` and sent to ``self.index_handler`` in a
    ``'store_page'`` request. ``_file`` is appended to ``self.parsed_docs``
    only after all pages were sent.

    Failures are not raised: any exception is logged (message at error
    level, traceback at debug level) and the method returns normally.
    """
    try:
        log.log_info("DocumentParser.load_file parsing xml...")
        root = ET.parse(_file).getroot()

        log.log_info("Loading pages...")
        for page in root.findall('page'):
            page_id = int(page.find('id').text)
            log.log_info("Loading page: {:}".format(page_id))
            page_data = self._parse_xml_page(page)
            # ask() is used so pages are stored one after another; the
            # reply itself is not inspected.
            self.index_handler.ask(
                msg.build_request(method='store_page',
                                  data={'page': page_data}))
        log.log_info("Done loading pages")
        self.parsed_docs.append(_file)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so SystemExit and
        # KeyboardInterrupt are not swallowed.
        log.log_error("DocumentParser.load_file error parsing xml")
        log.log_debug(traceback.format_exc())
def main(argv):
    """Run the interactive search prompt.

    ``argv`` is the command line. When it contains ``-d``, the argument
    that follows is loaded as a document before the prompt starts. The
    prompt then reads queries until ``q`` or ``quit`` is entered, sends each
    one to the orchestrator as a ``'search'`` request, and prints the
    response data or logs the error message.

    The orchestrator is stopped on every exit path, including errors.
    Ctrl-C or end-of-input at the prompt is treated like ``quit``.
    """
    log.flush_log()
    log.log_info("Hello")

    # process arguments
    document = None
    if '-d' in argv:
        position = argv.index('-d') + 1
        if position < len(argv):
            document = argv[position]
        else:
            # "-d" was the last argument; previously this raised IndexError.
            log.log_error("Option -d requires a file argument")

    # launch orchestrator
    orchestrator = Orchestrator.start()
    try:
        if document:
            response = orchestrator.ask(
                msg.build_request(method='load_file',
                                  data={'file': document}))
            if response['status'] != 0:
                log.log_error(response['error_msg'])
            else:
                log.log_info("Loaded file")

        while True:
            try:
                query = raw_input("Search: ")
            except (KeyboardInterrupt, EOFError):
                # Leaving the prompt with Ctrl-C / Ctrl-D is a normal quit.
                break
            log.log_info("Query: {:}".format(query))
            if query in ("q", "quit"):
                break

            response = orchestrator.ask(
                msg.build_request(method='search', data={'query': query}))
            if response['status'] == 0:
                print(response['data'])
            else:
                log.log_error(response['error_msg'])
    finally:
        # Always shut the actors down; the orchestrator is only reachable
        # from inside this function.
        orchestrator.stop()

    log.log_info("Goodbye")
def on_stop(self):
    """Log that the QueryProcessor is stopping; nothing else is done."""
    for text in ("Stopping QueryProcessor...", "QueryProcessor stopped"):
        log.log_info(text)
if __name__ == '__main__':
    # Script entry point: run the interactive prompt with the process
    # arguments.
    try:
        main(sys.argv)
    except KeyboardInterrupt:
        # The orchestrator and its responses are locals of main() and are
        # not visible at module scope; referencing them here raised
        # NameError. Actor shutdown therefore has to happen inside main();
        # this handler only records the exit.
        log.log_info("Goodbye")
def on_stop(self):
    """Stop the child actors when the Orchestrator stops.

    Children are stopped in the order document parser, query processor,
    index handler. A child whose attribute is missing or ``None`` (for
    example because it never started) is skipped, so the remaining children
    are still stopped.
    """
    log.log_info("Stopping Orchestrator...")
    for name in ('document_parser', 'query_processor', 'index_handler'):
        child = getattr(self, name, None)
        if child is not None:
            child.stop()
    log.log_info("Orchestrator stopped")
def on_start(self):
    """Start the three child actors when the Orchestrator starts.

    Start order matters because of the constructor arguments: the
    IndexHandler first, then the DocumentParser (given the index handler),
    then the QueryProcessor (given the document parser and the index
    handler).

    A failure to start a child is logged (message at error level, traceback
    at debug level) and start-up continues; the corresponding attribute is
    then left unassigned by this method.
    """
    log.log_info("Starting Orchestrator...")

    # ``except Exception`` rather than a bare ``except`` so SystemExit and
    # KeyboardInterrupt are not swallowed.
    try:
        log.log_info("Starting IndexHandler...")
        self.index_handler = IndexHandler.start()
        log.log_info("IndexHandler started")
    except Exception:
        log.log_error("Could not start IndexHandler")
        log.log_debug(traceback.format_exc())

    try:
        log.log_info("Starting DocumentParser...")
        self.document_parser = DocumentParser.start(self.index_handler)
        log.log_info("DocumentParser started")
    except Exception:
        log.log_error("Could not start DocumentParser")
        log.log_debug(traceback.format_exc())

    try:
        log.log_info("Starting QueryProcessor...")
        self.query_processor = QueryProcessor.start(
            self.document_parser, self.index_handler)
        log.log_info("QueryProcessor started")
    except Exception:
        log.log_error("Could not start QueryProcessor")
        log.log_debug(traceback.format_exc())

    log.log_info("Orchestrator started")
def on_stop(self):
    """Log that the IndexHandler is stopping; nothing else is done."""
    for text in ("Stopping IndexHandler...", "IndexHandler stopped"):
        log.log_info(text)