def _get_doc_info(self, doc_id):
    """Fetch title, text, author and date for *doc_id* from the DocumentParser actor.

    Returns a 4-tuple ``(title, text, author, date)``.  On failure the
    error is logged and four empty strings are returned, so callers that
    unpack the result do not crash.
    """
    response = self.document_parser.ask(
        msg.build_request(method='get_doc', data={'doc_id': doc_id}))
    if response['status'] == 0:
        data = response['data']
        return data['title'], data['text'], data['author'], data['date']
    else:
        log.log_error("Could not get info for doc: {:}".format(
            response['error_msg']))
        # Bug fix: the original fell through and returned None here, which
        # made callers unpacking four values raise TypeError.  Return empty
        # defaults instead, mirroring _stem_word's error handling.
        return "", "", "", ""
def _stem_word(self, word):
    """Ask the DocumentParser actor for the stem of *word*.

    Returns the stemmed word on success, or an empty string when the
    request fails (the failure is logged).
    """
    request = msg.build_request(method='stem_word', data={'word': word})
    response = self.document_parser.ask(request)
    if response['status'] != 0:
        log.log_error("Orchestrator could not stem word: {:}".format(
            response['error_msg']))
        return ""
    return response['data']['stem']
def _load_file(self, _file):
    """Parse the XML corpus file *_file* and store each <page> via the IndexHandler.

    On success the file path is appended to ``self.parsed_docs``.  Any
    parsing error is logged (traceback at debug level) and swallowed, so a
    bad corpus file does not take the actor down.
    """
    try:
        log.log_info("DocumentParser.load_file parsing xml...")
        tree = ET.parse(_file)
        root = tree.getroot()
        log.log_info("Loading pages...")
        for page in root.findall('page'):
            page_id = int(page.find('id').text)
            log.log_info("Loading page: {:}".format(page_id))
            page_data = self._parse_xml_page(page)
            self.index_handler.ask(
                msg.build_request(method='store_page',
                                  data={'page': page_data}))
        log.log_info("Done loading pages")
        self.parsed_docs.append(_file)
    # Bug fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt.  Exception keeps the best-effort behaviour while
    # letting those propagate.
    except Exception:
        log.log_error("DocumentParser.load_file error parsing xml")
        log.log_debug(traceback.format_exc())
def on_start(self):
    """Start the actor pipeline: IndexHandler, then DocumentParser, then QueryProcessor.

    Each actor is started in its own try block; a failure is logged (with
    the traceback at debug level) but does not abort starting the others,
    preserving the original best-effort start-up behaviour.
    """
    log.log_info("Starting Orchestrator...")
    try:
        log.log_info("Starting IndexHandler...")
        self.index_handler = IndexHandler.start()
        log.log_info("IndexHandler started")
    # Bug fix (all three handlers): bare `except:` also caught SystemExit
    # and KeyboardInterrupt; Exception is the widest class that is safe.
    except Exception:
        log.log_error("Could not start IndexHandler")
        log.log_debug(traceback.format_exc())
    try:
        log.log_info("Starting DocumentParser...")
        self.document_parser = DocumentParser.start(self.index_handler)
        log.log_info("DocumentParser started")
    except Exception:
        log.log_error("Could not start DocumentParser")
        log.log_debug(traceback.format_exc())
    try:
        log.log_info("Starting QueryProcessor...")
        self.query_processor = QueryProcessor.start(
            self.document_parser, self.index_handler)
        log.log_info("QueryProcessor started")
    except Exception:
        log.log_error("Could not start QueryProcessor")
        log.log_debug(traceback.format_exc())
    log.log_info("Orchestrator started")
def _get_doc_info(self, doc_id):
    """Scan the parsed corpus files for the page whose <id> equals *doc_id*.

    Returns a dict with keys 'title', 'text', 'author' and 'date'.  The
    text is ASCII-filtered and every character in ``self.unwanted_chars``
    is replaced by a space.  If the page is not found, placeholder values
    are returned and an error is logged.
    """
    for f in self.parsed_docs:
        log.log_info("Getting info from doc: {:}".format(f))
        tree = ET.parse(f)
        root = tree.getroot()
        for page in root.findall('page'):
            page_id = int(page.find('id').text)
            if page_id != doc_id:
                continue
            log.log_info("Found target page")
            title = page.find('title').text
            # Bug fix: an empty <text/> element has .text == None, which
            # crashed the original on .encode(); fall back to "".
            text_node = page.find('revision/text')
            text = text_node.text if text_node is not None else None
            text = (text or "").encode('ascii', 'ignore')
            # Iterate the (small) unwanted-char set instead of every
            # character of the document: str.replace is a no-op for absent
            # characters, so the result is identical but one pass cheaper.
            for c in self.unwanted_chars:
                text = text.replace(c, ' ')
            if page.find('revision/contributor/username') is not None:
                author = page.find('revision/contributor/username').text
            elif page.find('revision/contributor/ip') is not None:
                author = page.find('revision/contributor/ip').text
            else:
                author = "Author"
            date = page.find('revision/timestamp').text
            return {
                'title': title,
                'text': text,
                'author': author,
                'date': date
            }
    # Typo fix in the log message: "Cound" -> "Could".
    log.log_error(
        "Could not find document in corpus: {:}".format(doc_id))
    return {
        'title': "Title",
        'text': "Text",
        'author': "Author",
        'date': "Date"
    }
def main(argv): log.flush_log() log.log_info("Hello") # process arguments document = None if '-d' in argv: document = argv[argv.index('-d') + 1] # launch orchestrator orchestrator = Orchestrator.start() if document: response = orchestrator.ask( msg.build_request(method='load_file', data={'file': document})) if response['status'] != 0: log.log_error(response['error_msg']) else: log.log_info("Loaded file") is_running = True while is_running: query = raw_input("Search: ") log.log_info("Query: {:}".format(query)) if query == "q" or query == "quit": is_running = False else: response = orchestrator.ask( msg.build_request(method='search', data={'query': query})) if response['status'] == 0: # log.log_info("Found word: {:}".format(response['data'])) print response['data'] else: log.log_error(response['error_msg']) orchestrator.stop() log.log_info("Goodbye")