Exemplo n.º 1
0
    def parse_xml_article(self, q):
        """Stream the article dump into the output XML file, then report the count.

        The dump at ``self._article_xml_input_file`` is opened through
        ``bz2.BZ2File`` when libmagic's description of it mentions "bzip2",
        and as a plain file otherwise. Every ``page`` element (in any
        namespace) is passed to ``self.process_element`` together with the
        output handle, and the results are wrapped in a ``<root>`` element
        in ``self._article_xml_output``.

        Args:
            q: Object exposing ``put()``; receives ``self._total_articles``
                once all pages have been handled. Presumably a
                multiprocessing/queue.Queue -- confirm against the caller.
        """
        source_path = self._article_xml_input_file

        # Both branches yield a *binary* stream: lxml detects the document
        # encoding itself, and feeding it a text-mode file forces a needless
        # (and locale-dependent) decode/re-encode round trip.
        if 'bzip2' in magic.from_file(source_path).lower():
            source = bz2.BZ2File(source_path, 'r')
        else:
            source = open(source_path, 'rb')

        # `source` is entered first so that it is closed even when opening
        # the output file fails (previously the handle leaked in that case).
        with source, open(self._article_xml_output, 'w') as output:
            print('<?xml version="1.0" ?>\n<root>', file=output)

            tree = etree.iterparse(source, tag='{*}page', remove_comments=True)
            fast_iter(tree, self.process_element, output)

            # Publish the total only after every page has been processed.
            q.put(self._total_articles)
            print("</root>", file=output)
Exemplo n.º 2
0
    def parse_xml_article(self, q):
        """Convert the article dump into the output XML file and report the count.

        The input file ``self._article_xml_input_file`` is read through
        ``bz2.BZ2File`` if libmagic describes it as bzip2 data, otherwise
        as a regular file. Each ``page`` element (namespace-agnostic) is
        handed to ``self.process_element`` along with the output handle;
        the output written to ``self._article_xml_output`` is enclosed in
        a single ``<root>`` element.

        Args:
            q: Receives ``self._total_articles`` via ``q.put()`` after the
                whole dump has been iterated. Looks like a queue shared
                with another process/thread -- verify against the caller.
        """
        path = self._article_xml_input_file
        is_bzip2 = 'bzip2' in magic.from_file(path).lower()

        # Always give lxml raw bytes: it works out the encoding from the
        # document itself, whereas a text-mode handle would decode with the
        # locale's codec first.
        source = bz2.BZ2File(path, 'r') if is_bzip2 else open(path, 'rb')

        # The input handle is the outermost context so it is released even
        # if the output file cannot be opened.
        with source:
            with open(self._article_xml_output, 'w') as output:
                print('<?xml version="1.0" ?>\n<root>', file=output)

                tree = etree.iterparse(source,
                                       tag='{*}page',
                                       remove_comments=True)
                fast_iter(tree, self.process_element, output)

                # Report the total once all pages have been processed.
                q.put(self._total_articles)
                print("</root>", file=output)
Exemplo n.º 3
0
    def do_job(self):
        """Write category statistics for all articles to ``temp_output``.

        Steps, in order:

        1. Iterate the ``category_trunc`` elements of ``self._cats`` with
           ``self.gethash``, filling two local dictionaries.
        2. Call ``self.indexFile()``.
        3. Iterate the ``data`` elements of ``self._pages`` with
           ``self.findcats``, which receives the output handle, both
           dictionaries and ``self._dick``.
        4. Append ``<total>``, ``<unique>``, ``<avg>`` and
           ``<onetimeunique>`` summary elements, close the ``<root>``
           element and close ``self._writer``.

        The average is ``self._unique / self._total_art``; when no articles
        were counted it is written as ``0.0`` instead of raising
        ``ZeroDivisionError`` and leaving a truncated document behind.
        """
        with open('temp_output', 'w') as output:
            print('<?xml version="1.0" ?>\n<root>', file=output)

            # Binary mode: lxml determines the encoding from the documents
            # themselves, so no locale-dependent text decoding is involved.
            with open(self._pages, 'rb') as page, \
                    open(self._cats, 'rb') as cats:
                tree_articles = etree.iterparse(page, tag='data',
                                                remove_comments=True,
                                                remove_blank_text=True)
                print('working on it')

                tree_link = etree.iterparse(cats, tag='category_trunc',
                                            remove_blank_text=True)
                dict_link = {}
                self._dick = {}
                full_name_dict_link = {}

                # The category lookup tables must be complete before the
                # articles are scanned against them.
                fast_iter(tree_link, self.gethash, dict_link,
                          full_name_dict_link)
                self.indexFile()
                fast_iter(tree_articles, self.findcats, output, dict_link,
                          full_name_dict_link, self._dick)

                # Guard the division so an empty article set still yields a
                # complete, well-formed document.
                total_art = self._total_art
                average = self._unique / total_art if total_art else 0.0

                print("<total>" + str(self._total_cats) + "</total>",
                      file=output)
                print("<unique>" + str(self._unique) + "</unique>",
                      file=output)
                print("<avg>" + str(average) + "</avg>", file=output)
                print("<onetimeunique>" + str(len(self._dick))
                      + "</onetimeunique>", file=output)

                print("</root>", file=output)
                # NOTE(review): self._writer is created outside this method
                # (presumably by indexFile()); it is not closed if anything
                # above raises -- confirm ownership before moving this into
                # a finally block.
                self._writer.close()