示例#1
0
    def do_index(self, cfg_file="index.cfg"):
        """Generate the weighting model from an inverted list and pickle it.

        The configuration read from ``cfg_file`` supplies the input path (first
        ``LEIA`` entry, handed to ``self.read_inv_list``) and the output path
        (first ``ESCREVA`` entry, where ``self.weight_function`` is pickled
        after ``generate_model`` has been called on it).

        Args:
            cfg_file: Path of the configuration file.
        """
        logging.info("Execution begin")

        settings = ConfigReader.read_cfg(cfg_file)
        logging.info("Configuration file read")

        # Only the first entry of each configuration key is used.
        source_path, target_path = settings['LEIA'][0], settings['ESCREVA'][0]

        inverted = self.read_inv_list(source_path)
        term_count = str(len(inverted))
        logging.info("Inverted list read: " + term_count + " terms")

        self.weight_function.generate_model(inverted)
        logging.info("Model generated")

        # Persist the whole weight-function object, not just its data.
        with open(target_path, 'wb') as sink:
            pickle.dump(self.weight_function, sink)
        logging.info("Model saved")

        logging.info("Execution ended")
    def process_queries(self, cfg_file_name='pc.cfg'):
        """Parse the configured XML query files and write the processed output.

        Every ``QUERY`` element found in each file listed under ``LEIA`` is
        passed to ``self._process_xml_query``. Afterwards ``self.write_queries``
        is called with the first ``CONSULTAS`` and first ``ESPERADOS`` entries
        of the configuration.

        Args:
            cfg_file_name: Path of the configuration file.
        """
        logging.info("Execution begin")

        settings = ConfigReader.read_cfg(cfg_file_name)
        logging.info("Configuration file read")

        # Time parsing and per-query processing together.
        started = time.perf_counter()
        for path in settings['LEIA']:
            for node in minidom.parse(path).getElementsByTagName('QUERY'):
                self._process_xml_query(node)
        duration = time.perf_counter() - started

        query_total = len(self.expected_docs_by_query)
        file_total = len(settings['LEIA'])
        logging.info(
            "Queries processed: " + str(query_total)
            + " queries read from " + str(file_total) + " file(s)"
        )
        logging.info(
            "Query processor performance: " + str(query_total/duration)
            + " queries per second."
        )

        queries_path = settings['CONSULTAS'][0]
        expected_path = settings['ESPERADOS'][0]
        self.write_queries(queries_path, expected_path)
        logging.info("Query processing saved")
        logging.info("Execution ended")
示例#3
0
    def do_search(self, config_file_name="busca.cfg"):
        """Run every query of the queries file against the pickled model.

        The configuration read from ``config_file_name`` supplies, as the first
        entry of each key: ``MODELO`` (path of the pickled model),
        ``CONSULTAS`` (queries file with one ``id;text`` record per line) and
        ``RESULTADOS`` (output path). The unpickled model is stored in
        ``self.model``; the value returned by ``self.model.retrieve`` for each
        query is handed to ``self._write`` together with the open results file.

        Args:
            config_file_name: Path of the configuration file.

        Raises:
            IndexError: If a non-blank line of the queries file has no ``;``.
        """
        logging.info("Execution begin")
        cfg = ConfigReader.read_cfg(config_file_name)
        logging.info("Configuration file read")

        model_file_name = cfg["MODELO"][0]
        queries_file_name = cfg["CONSULTAS"][0]
        results_file_name = cfg["RESULTADOS"][0]

        # NOTE: unpickling can execute arbitrary code, so the model file must
        # come from a trusted source. The context manager guarantees the file
        # handle is closed even if loading fails.
        with open(model_file_name, 'rb') as model_file:
            self.model = pickle.load(model_file)
        logging.info("Model loaded")

        queries = {}
        with open(queries_file_name) as queries_file:
            for line in queries_file:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record and would otherwise fail the field lookup.
                if not line.strip():
                    continue
                fields = line.split(';')
                # The text field is stored exactly as split; when it is the
                # last field on the line it still ends with the line break.
                queries[fields[0]] = fields[1]

        n_queries = len(queries)
        logging.info("Queries file loaded: " + str(n_queries) + " loaded")

        # Only the retrieval itself is timed, not file loading or writing.
        begin_time = time.perf_counter()
        query_results = {
            query_id: self.model.retrieve(query_text)
            for query_id, query_text in queries.items()
        }
        elapsed_time = time.perf_counter() - begin_time

        logging.info("Retrieval done")
        logging.info("Retrieval performance: " + str(n_queries/elapsed_time) + " queries per seconds")

        with open(results_file_name, 'w') as results_file:
            for query_id, similarities in query_results.items():
                self._write(query_id, similarities, results_file)

        logging.info("Queries results saved")
        logging.info("Execution ended")
    def parse_corpus(self, cfg_file):
        """Read the configured corpus files and write their inverted list.

        Each file listed under ``LEIA`` is read with ``self.read_xml`` and the
        results are merged into a single dict. ``self.generate_inverted_list``
        is then called on that dict and its result is written by
        ``self.write_inverted_list`` to the first ``ESCREVE`` entry.

        Args:
            cfg_file: Path of the configuration file.
        """
        logging.info("Execution begin")

        settings = ConfigReader.read_cfg(cfg_file)
        logging.info("Configuration file read")

        # Later files overwrite earlier entries that share a key.
        documents = {}
        for source_path in settings["LEIA"]:
            documents.update(self.read_xml(source_path))

        doc_total = str(len(documents))
        file_total = str(len(settings["LEIA"]))
        logging.info("Corpus read: " + doc_total + " documents readed from " + file_total + " files")

        # Only the inverted-list construction is timed.
        started = time.perf_counter()
        inverted = self.generate_inverted_list(documents)
        duration = time.perf_counter() - started

        logging.info("Inverted list generated: " + str(len(inverted)) + " terms collected")
        logging.info("Inverted list performance: " + str(len(documents)/duration) + " documents per second")

        target_path = settings["ESCREVE"][0]
        self.write_inverted_list(inverted, target_path)
        logging.info("Inverted list saved")
        logging.info("Execution ended")