def parse_xml_article(self, q):
    """Stream the ``page`` elements of the article dump into the output XML.

    The input file is probed with ``magic``; when the description mentions
    bzip2 it is opened through ``bz2.BZ2File``, otherwise it is read
    directly. An ``etree.iterparse`` parser restricted to ``{*}page``
    elements is handed to ``fast_iter`` (defined elsewhere) together with
    ``self.process_element`` and the output handle. The output is wrapped
    in a single ``<root>`` element.

    Args:
        q: Object exposing ``put``; receives ``self._total_articles`` after
            ``fast_iter`` returns. Presumably a queue shared with the
            caller -- confirm against the call site.
    """
    source_path = self._article_xml_input_file
    if 'bzip2' in magic.from_file(source_path).lower():
        source = bz2.BZ2File(source_path, 'r')
    else:
        # Binary mode, so the parser receives raw bytes exactly as it does
        # in the bzip2 branch instead of text decoded with the locale's
        # default encoding.
        source = open(source_path, 'rb')

    # Enter the input's context manager before anything else can fail, so
    # the handle is closed even if the output file cannot be opened.
    with source:
        with open(self._article_xml_output, 'w') as output:
            print('<?xml version="1.0" ?>\n<root>', file=output)
            tree = etree.iterparse(source, tag='{*}page',
                                   remove_comments=True)
            fast_iter(tree, self.process_element, output)
            q.put(self._total_articles)
            print("</root>", file=output)
def parse_xml_article(self, q):
    """Copy the processed ``page`` elements of the input dump to the output.

    ``magic.from_file`` decides how the input is opened: through
    ``bz2.BZ2File`` when the reported type contains "bzip2", as a plain
    binary file otherwise. The resulting ``etree.iterparse`` parser
    (limited to ``{*}page`` tags, comments removed) is passed to
    ``fast_iter`` along with ``self.process_element`` and the open output
    file. Output is enclosed in one ``<root>`` element.

    Args:
        q: Receiver for ``self._total_articles`` via ``q.put`` once
            ``fast_iter`` has returned (looks like an inter-process or
            inter-thread queue -- verify with the caller).
    """
    input_path = self._article_xml_input_file
    is_bzip2 = 'bzip2' in magic.from_file(input_path).lower()
    if is_bzip2:
        articles_in = bz2.BZ2File(input_path, 'r')
    else:
        # Read bytes, matching what BZ2File yields above; a text-mode
        # handle would decode with the platform default encoding before
        # the XML parser ever sees the data.
        articles_in = open(input_path, 'rb')

    # The input handle is owned by this ``with`` from the start, so a
    # failure to open the output file no longer leaks it.
    with articles_in:
        with open(self._article_xml_output, 'w') as output:
            print('<?xml version="1.0" ?>\n<root>', file=output)
            tree = etree.iterparse(articles_in, tag='{*}page',
                                   remove_comments=True)
            fast_iter(tree, self.process_element, output)
            q.put(self._total_articles)
            print("</root>", file=output)
def do_job(self):
    """Write per-article category output and summary counters to ``temp_output``.

    Two passes are made with ``fast_iter`` (defined elsewhere):

    1. ``category_trunc`` elements of ``self._cats`` are handed to
       ``self.gethash`` with two initially empty dicts (``dict_link`` and
       ``full_name_dict_link``), after which ``self.indexFile()`` is called.
    2. ``data`` elements of ``self._pages`` are handed to ``self.findcats``
       with the output file, both dicts and ``self._dick``.

    Afterwards ``<total>``, ``<unique>``, ``<avg>`` and ``<onetimeunique>``
    elements are appended from ``self._total_cats``, ``self._unique``,
    ``self._total_art`` and ``len(self._dick)``, the ``<root>`` element is
    closed and ``self._writer`` is closed.
    """
    with open('temp_output', 'w') as output:
        print('<?xml version="1.0" ?>\n<root>', file=output)
        with open(self._pages) as page:
            tree_articles = etree.iterparse(page, tag='data',
                                            remove_comments=True,
                                            remove_blank_text=True)
            with open(self._cats) as cats:
                print('working on it')
                tree_link = etree.iterparse(cats, tag='category_trunc',
                                            remove_blank_text=True)
                dict_link = {}
                self._dick = {}
                full_name_dict_link = {}
                fast_iter(tree_link, self.gethash, dict_link,
                          full_name_dict_link)
                self.indexFile()
            fast_iter(tree_articles, self.findcats, output, dict_link,
                      full_name_dict_link, self._dick)

        # With no articles counted the average is undefined; report 0
        # rather than raising ZeroDivisionError and leaving the XML
        # document without its closing tag.
        article_count = self._total_art
        average = self._unique / article_count if article_count else 0

        summary = (
            ("total", self._total_cats),
            ("unique", self._unique),
            ("avg", average),
            ("onetimeunique", len(self._dick)),
        )
        for tag, value in summary:
            print("<" + tag + ">" + str(value) + "</" + tag + ">",
                  file=output)
        print("</root>", file=output)
    self._writer.close()