示例#1
0
    def on_receive(self, message):
        """Route an incoming message to the matching DocumentParser operation.

        The message is indexed by 'method' and 'data'. Handled methods and the
        data field each one reads:

        * 'load_file' - data['file'], passed to self._load_file
        * 'stem_word' - data['word'], passed to self._stem_word
        * 'get_doc'   - data['doc_id'], passed to self._get_doc_info

        Returns a response from msg.build_response: status 0 on success
        (carrying the stem or the document info as data where applicable),
        status -1 when the required field is falsy, and status -13 when the
        method is not one of the above.
        """
        log.log_info("DocumentParser received message: {:}".format(message))
        method = message['method']

        if method == 'load_file':
            path = message['data']['file']
            if not path:
                return msg.build_response(
                    status=-1,
                    error_msg="DocumentParser.load_file no file provided")
            self._load_file(path)
            return msg.build_response(status=0)

        if method == 'stem_word':
            word = message['data']['word']
            if not word:
                return msg.build_response(
                    status=-1,
                    error_msg="DocumentParser.stem_word no word provided")
            return msg.build_response(
                status=0, data={'stem': self._stem_word(word)})

        if method == 'get_doc':
            doc_id = message['data']['doc_id']
            if not doc_id:
                return msg.build_response(
                    status=-1,
                    error_msg="DocumentParser.get_doc no doc id provided")
            return msg.build_response(
                status=0, data=self._get_doc_info(doc_id))

        # none of the known methods matched
        return msg.build_response(
            status=-13,
            error_msg="No method to process message: {:}".format(message))
    def on_receive(self, message):
        """Handle a request sent to the Orchestrator.

        The message is indexed by 'method' and 'data'. Handled methods:

        * 'load_file' - requires a truthy data['file']; the message is
          forwarded unchanged to self.document_parser.
        * 'search' - requires a truthy data['query']; the data is re-sent to
          self.query_processor as a 'query' request.

        Returns a response dict. Status 0 means success (for 'search' the
        query processor's own response is returned as-is). Status -1 means
        the required field was falsy, -2 means the downstream actor reported
        a failure, and -13 means the method is not handled here.
        """
        log.log_info("Orchestrator received message: {:}".format(message))
        if message['method'] == 'load_file':
            data = message['data']
            if data['file']:
                response = self.document_parser.ask(message)
                if response['status'] == 0:
                    return msg.build_response(status=0)
                else:
                    return msg.build_response(
                        status=-2,
                        error_msg="Orchestrator.load_file failed: {:}".format(
                            response['error_msg']))
            else:
                return msg.build_response(
                    status=-1,
                    error_msg="No file provided to Orchestrator.load_file")
        elif message['method'] == 'search':
            data = message['data']
            if data['query']:
                response = self.query_processor.ask(
                    msg.build_request(method='query', data=message['data']))
                if response['status'] == 0:
                    return response
                # A failed search must be reported as such. Without this
                # return the code falls through to the generic -13
                # "No method to process message" reply and hides the cause.
                return msg.build_response(
                    status=-2,
                    error_msg="Orchestrator.search failed: {:}".format(
                        response['error_msg']))
            else:
                return msg.build_response(
                    status=-1,
                    error_msg="No word provided to Orchestrator.search")

        return msg.build_response(
            status=-13,
            error_msg="No method to process message: {:}".format(message))
示例#3
0
 def _load_by_page(self, page):
     """Merge one page's word counts into self.index.

     page is a two-level mapping (word -> document -> count). Every
     page[word][doc] value is copied to self.index[word][doc]; an empty
     dict is created for words not yet present in the index, and entries
     already stored under the same word/document are overwritten.
     """
     log.log_info("IndexHandler is adding a page...")
     for word, postings in page.items():
         # Test membership on the mapping itself: on Python 2 `.keys()`
         # builds a list, making every check a linear scan of the index.
         if word not in self.index:
             self.index[word] = {}
         entry = self.index[word]
         for doc_id, count in postings.items():
             entry[doc_id] = count
     log.log_info("Done adding page")
示例#4
0
    def on_stop(self):
        """Log the start and end of the DocumentParser shutdown.

        No state is saved here: the code that would pickle self.stop_words
        to "stop-words.p" is disabled below.
        """
        log.log_info("Stopping DocumentParser...")

        # Disabled: persisting the stop words to "stop-words.p" on shutdown.
        # log.log_info("Writing stop words to persistance file")
        # with open("stop-words.p", "w") as f:
        #     pickle.dump(self.stop_words, f)
        # log.log_info("Stop words saved")

        log.log_info("DocumentParser stopped")
示例#5
0
    def on_receive(self, message):
        """Answer a 'query' message by running the query it carries.

        The message is indexed by 'method' and 'data'. For method 'query'
        with a truthy data['query'], the result of self._process_query is
        returned directly. A falsy query yields a status -1 response, and
        any other method yields a status -13 response.
        """
        log.log_info("QueryProcessor received message: {:}".format(message))

        # anything other than 'query' is not handled by this actor
        if message['method'] != 'query':
            return msg.build_response(
                status=-13,
                error_msg="No method to process message: {:}".format(message))

        query = message['data']['query']
        if query:
            return self._process_query(query)

        return msg.build_response(
            status=-1,
            error_msg="QueryProcessor.query no query provided")
示例#6
0
    def _beautify_result(self, data, word):
        """Render the documents matching a query as one printable string.

        data maps a document id to a sortable value (the term frequency,
        per the original comment); documents are listed from the highest
        value to the lowest. Each entry shows a 1-based position, the title
        wrapped in ANSI colour codes, the document id, author, date and the
        text as returned by self._beautify_text(text, word).

        Returns the concatenated entries, or an empty string when data is
        empty.
        """
        # rank documents by their value in data, highest first
        ranked = sorted(data.items(), key=lambda x: x[1], reverse=True)
        log.log_info("Docs for query: {:}".format(ranked))

        entry_format = (
            "[{:}]\t\033[94m{:}\033[0m - Doc: {:}\n\t{:} - {:}\n\t{:}\n")

        entries = []
        for position, (doc_id, _) in enumerate(ranked, 1):
            # fetch title, text, author and date for this document
            title, text, author, date = self._get_doc_info(doc_id)
            snippet = self._beautify_text(text, word)
            entries.append(entry_format.format(
                position, title, doc_id, author, date, snippet))

        return "".join(entries)
示例#7
0
    def on_start(self):
        """Load the stop-word list when the DocumentParser starts.

        First tries to unpickle the list from "stop-words.p" into
        self.stop_words. If that fails with any ordinary exception (missing
        file, corrupt pickle, ...), falls back to reading "stop-words.txt"
        and appending each whitespace-stripped line to self.stop_words.

        Raises whatever opening or reading "stop-words.txt" raises when the
        fallback itself fails.
        """
        # load stop words
        log.log_info("DocumentParser.on_start loading stop words")
        try:
            log.log_info("Reading from persistance file")
            with open("stop-words.p", "r") as f:
                # NOTE(review): unpickling can execute arbitrary code; this
                # is only safe while "stop-words.p" is a trusted local file.
                self.stop_words = pickle.load(f)
        except Exception:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit are not mistaken for a bad file.
            log.log_info("Could not read from persistance, rebuilding")
            with open("stop-words.txt", 'r') as f:
                self.stop_words.extend(line.strip() for line in f)

        log.log_info("Stop words loaded")
示例#8
0
 def _get_doc_info(self, doc_id):
     """Look up a page by id in the parsed XML files and return its details.

     Every file in self.parsed_docs is parsed in turn and its <page>
     elements are scanned for one whose <id> equals doc_id (compared as an
     int).

     Returns a dict with the keys 'title', 'text', 'author' and 'date'.
     The text is ASCII-encoded (other characters dropped) and every
     character found in self.unwanted_chars is replaced by a space. The
     author is the contributor's username, else the contributor's ip, else
     the literal "Author". When no file contains the page, an error is
     logged and a dict of placeholder strings is returned.
     """
     for f in self.parsed_docs:
         log.log_info("Getting info from doc: {:}".format(f))
         root = ET.parse(f).getroot()
         for page in root.findall('page'):
             if int(page.find('id').text) != doc_id:
                 continue

             log.log_info("Found target page")
             title = page.find('title').text
             raw_text = page.find('revision/text').text.encode(
                 'ascii', 'ignore')
             # Blank out unwanted characters in a single pass instead of
             # re-scanning the whole text once per occurrence.
             text = ''.join(
                 ' ' if c in self.unwanted_chars else c for c in raw_text)

             username = page.find('revision/contributor/username')
             ip = page.find('revision/contributor/ip')
             if username is not None:
                 author = username.text
             elif ip is not None:
                 author = ip.text
             else:
                 author = "Author"

             return {
                 'title': title,
                 'text': text,
                 'author': author,
                 'date': page.find('revision/timestamp').text
             }

     # Give up only after every parsed file has been searched. This
     # fallback used to sit inside the file loop, so only the first file
     # was ever searched and an empty parsed_docs returned None.
     log.log_error(
         "Cound not find document in corpus: {:}".format(doc_id))
     return {
         'title': "Title",
         'text': "Text",
         'author': "Author",
         'date': "Date"
     }
示例#9
0
    def on_receive(self, message):
        """Route an incoming message to the matching IndexHandler operation.

        The message is indexed by 'method' and 'data'. Handled methods:

        * 'store_page' - data['page'] is merged via self._load_by_page
        * 'search'     - data['word'] is looked up via self._search

        Returns a response from msg.build_response: status 0 on success
        (with the search result as data for 'search'), status -1 when the
        required field is falsy, and status -13 for any other method.
        """
        log.log_info("IndexHandler received message: {:}".format(message))
        method = message['method']

        if method == 'store_page':
            page = message['data']['page']
            if not page:
                return msg.build_response(
                    status=-1,
                    error_msg="IndexHandler.store_page no page provided")
            self._load_by_page(page)
            return msg.build_response(status=0)

        if method == 'search':
            word = message['data']['word']
            if not word:
                return msg.build_response(
                    status=-1,
                    error_msg="IndexHandler.search no word provided")
            return msg.build_response(status=0, data=self._search(word))

        # none of the known methods matched
        return msg.build_response(
            status=-13,
            error_msg="No method to process message: {:}".format(message))
示例#10
0
 def _load_file(self, _file):
     """Parse an XML file and send each of its pages to the index handler.

     Every <page> element under the document root is converted with
     self._parse_xml_page and sent to self.index_handler as a 'store_page'
     request. Once all pages have been sent, _file is appended to
     self.parsed_docs.

     Errors are not raised to the caller: any ordinary exception is logged
     (with its traceback at debug level) and the file is then not recorded
     in self.parsed_docs. Pages sent before the failure are not rolled
     back by this method.
     """
     try:
         log.log_info("DocumentParser.load_file parsing xml...")
         root = ET.parse(_file).getroot()
         log.log_info("Loading pages...")
         for page in root.findall('page'):
             page_id = int(page.find('id').text)
             log.log_info("Loading page: {:}".format(page_id))
             request = msg.build_request(
                 method='store_page',
                 data={'page': self._parse_xml_page(page)})
             self.index_handler.ask(request)
         log.log_info("Done loading pages")
         self.parsed_docs.append(_file)
     except Exception:
         # `except Exception` rather than a bare `except:` so that
         # KeyboardInterrupt / SystemExit still propagate.
         log.log_error("DocumentParser.load_file error parsing xml")
         log.log_debug(traceback.format_exc())
示例#11
0
def main(argv):
    """Run the interactive search prompt.

    argv is the command-line argument list. If it contains '-d', the
    argument after it names a document that the orchestrator is asked to
    load before the prompt starts; a '-d' with nothing after it is logged
    as an error and ignored.

    The prompt then reads queries until the user enters "q" or "quit" (or
    input ends), sends each one to the orchestrator as a 'search' request
    and prints the data of successful responses. The orchestrator is
    stopped on every exit path, including exceptions.
    """
    log.flush_log()
    log.log_info("Hello")

    # process arguments
    document = None
    if '-d' in argv:
        value_pos = argv.index('-d') + 1
        if value_pos < len(argv):
            document = argv[value_pos]
        else:
            # '-d' was the last argument: report it rather than IndexError
            log.log_error("Option -d requires a file path")

    # launch orchestrator
    orchestrator = Orchestrator.start()
    try:
        if document:
            response = orchestrator.ask(
                msg.build_request(method='load_file', data={'file': document}))
            if response['status'] != 0:
                log.log_error(response['error_msg'])
            else:
                log.log_info("Loaded file")

        while True:
            try:
                query = raw_input("Search: ")
            except EOFError:
                # end of input (Ctrl-D, closed pipe) is treated as "quit"
                break
            log.log_info("Query: {:}".format(query))
            if query in ("q", "quit"):
                break

            response = orchestrator.ask(
                msg.build_request(method='search', data={'query': query}))
            if response['status'] == 0:
                # parenthesised single argument prints identically on
                # Python 2 and is also valid Python 3 syntax
                print(response['data'])
            else:
                log.log_error(response['error_msg'])
    finally:
        # The orchestrator is a local, so this is the only place that can
        # stop it when the loop is left through an exception such as
        # KeyboardInterrupt.
        orchestrator.stop()

    log.log_info("Goodbye")
示例#12
0
 def on_stop(self):
     """Log the start and end of the QueryProcessor shutdown.

     Nothing else is done here.
     """
     log.log_info("Stopping QueryProcessor...")
     log.log_info("QueryProcessor stopped")
示例#13
0
            log.log_info("Loaded file")

    is_running = True
    while is_running:
        query = raw_input("Search: ")
        log.log_info("Query: {:}".format(query))
        if query == "q" or query == "quit":
            is_running = False
        else:
            response = orchestrator.ask(
                msg.build_request(method='search', data={'query': query}))
            if response['status'] == 0:
                # log.log_info("Found word: {:}".format(response['data']))
                print response['data']
            else:
                log.log_error(response['error_msg'])

    orchestrator.stop()
    log.log_info("Goodbye")


if __name__ == '__main__':
    try:
        main(sys.argv)
    except KeyboardInterrupt:
        # `orchestrator` and `response` are locals of main() and do not
        # exist at module level, so touching them here would turn Ctrl-C
        # into a NameError. Any orchestrator cleanup has to happen inside
        # main(); this handler only ends the run quietly.
        log.log_info("Goodbye")
示例#14
0
 def on_stop(self):
     """Stop the child actors held by the Orchestrator.

     The document parser, query processor and index handler are stopped in
     that order. A child whose attribute is missing or None is skipped.
     """
     log.log_info("Stopping Orchestrator...")
     # on_start logs and swallows a child's start-up failure, which can
     # leave its attribute unset. Skipping such a child keeps one failed
     # start from aborting the shutdown of the children that did start.
     for attr in ('document_parser', 'query_processor', 'index_handler'):
         child = getattr(self, attr, None)
         if child is not None:
             child.stop()
     log.log_info("Orchestrator stopped")
示例#15
0
    def on_start(self):
        """Start the child actors the Orchestrator depends on.

        Starts, in order, the IndexHandler, the DocumentParser (given the
        index handler) and the QueryProcessor (given the document parser
        and the index handler), storing them on self.index_handler,
        self.document_parser and self.query_processor.

        A failure to start any child is logged (with its traceback at debug
        level) and start-up continues. The corresponding attribute is then
        not assigned, so later steps that need it fail and are logged in
        the same way.
        """
        log.log_info("Starting Orchestrator...")

        # Each step catches `Exception` rather than using a bare `except:`
        # so that KeyboardInterrupt / SystemExit are not swallowed.
        try:
            log.log_info("Starting IndexHandler...")
            self.index_handler = IndexHandler.start()
            log.log_info("IndexHandler started")
        except Exception:
            log.log_error("Could not start IndexHandler")
            log.log_debug(traceback.format_exc())

        try:
            log.log_info("Starting DocumentParser...")
            self.document_parser = DocumentParser.start(self.index_handler)
            log.log_info("DocumentParser started")
        except Exception:
            log.log_error("Could not start DocumentParser")
            log.log_debug(traceback.format_exc())

        try:
            log.log_info("Starting QueryProcessor...")
            self.query_processor = QueryProcessor.start(
                self.document_parser, self.index_handler)
            log.log_info("QueryProcessor started")
        except Exception:
            log.log_error("Could not start QueryProcessor")
            log.log_debug(traceback.format_exc())

        log.log_info("Orchestrator started")
示例#16
0
 def on_stop(self):
     """Log the start and end of the IndexHandler shutdown.

     Nothing else is done here.
     """
     log.log_info("Stopping IndexHandler...")
     log.log_info("IndexHandler stopped")