def __parse(self, preprocess):
    """Parse the backup XML file at ``self.filename`` into a ``BlogData``.

    The file is read as raw bytes.  When *preprocess* is true the bytes are
    decoded as UTF-8 (undecodable bytes are replaced), passed through
    ``strip_xml``, re-encoded, and cut off right after the last
    ``</blog_backup>`` closing tag before being handed to the XML parser.

    Categories, articles and comments are then read from the tree.  Each
    comment is attached to the article named by its ``article_id``; a
    comment whose article is not present is reported with ``print`` and
    not attached to anything.

    Returns the populated ``BlogData`` instance.
    """
    date_format = "%Y-%m-%d %H:%M:%S"

    # The context manager closes the file even if read() raises.
    with open(self.filename, "rb") as f:
        xml_data = f.read()

    if preprocess:
        xml_data = xml_data.decode("utf8", "replace")
        xml_data = strip_xml(xml_data)
        xml_data = xml_data.encode("utf8")
        # NOTE(review): the truncation below is treated as part of the
        # preprocessing step -- confirm it is not meant to run always.
        end_pattern = "</blog_backup>"
        # NOTE(review): presumably my_rfind returns the index of the last
        # occurrence (like str.rfind); a not-found result is not handled.
        find_index = my_rfind(xml_data, end_pattern)
        end_index = find_index + len(end_pattern)
        xml_data = xml_data[0:end_index]

    tree = ET.fromstring(xml_data)
    category_nodes = tree.findall("blog_articles_categories/category")
    article_nodes = tree.findall("*/article")
    # Wretch now uses blog_articles_comments_NN (NN being a 2-digit number)
    # instead of blog_articles_comments; the wildcard matches every variant.
    comment_nodes = tree.findall("*/article_comment")

    aid_map = {}    # article id -> Article
    cid_name = {}   # category id -> category name
    blogdata = BlogData()

    # TODO: error handling
    for node in category_nodes:
        cid_name[node.findtext("id")] = node.findtext("name")

    for node in article_nodes:
        article = Article()
        article.author = node.findtext("userid")
        article.title = node.findtext("title")
        article.date = self.parse_date(node.findtext("date"))
        # In Wretch, every article has only one category.
        cid = node.findtext("category_id")
        if cid in cid_name:
            article.category.append(cid_name[cid])
        if node.findtext("isCloak") == "0":
            article.status = Article.PUBLISH
        else:
            article.status = Article.PRIVATE
        article.allow_comments = True
        article.allow_pings = True
        article.body = node.findtext("text")
        aid_map[node.findtext("id")] = article
        blogdata.articles.append(article)

    for node in comment_nodes:
        comment = Comment()
        comment.author = node.findtext("name")
        comment.email = node.findtext("email")
        comment.url = node.findtext("url")
        comment.date = datetime.strptime(node.findtext("date"), date_format)
        comment.body = node.findtext("text")
        comment.host = node.findtext("blog_id")
        comment.reply = node.findtext("reply")
        # findtext() yields None for a missing <reply> and "" for an empty
        # one; only a real reply is expected to carry a reply date.
        if comment.reply:
            comment.rdate = datetime.strptime(node.findtext("reply_date"),
                                              date_format)

        aid = node.findtext("article_id")
        try:
            article = aid_map[aid]
        except KeyError:
            # Report the comment's own id rather than a stale category id.
            # Presumably <article_comment> has an <id> child like the other
            # nodes; findtext() gives None if it does not.
            comment_id = node.findtext("id")
            print("Comment %s missing article %s" % (comment_id, aid))
        else:
            article.comments.append(comment)

    # TODO: process category
    return blogdata
def __parse(self, preprocess):
    """Parse the backup XML file at ``self.filename`` into a ``BlogData``.

    The file is read as raw bytes.  When *preprocess* is true the bytes are
    decoded as UTF-8 (undecodable bytes are replaced), passed through
    ``strip_xml``, re-encoded, and cut off right after the last
    ``</blog_backup>`` closing tag before being handed to the XML parser.

    Categories, articles and comments are then read from the tree.  Each
    comment is attached to the article named by its ``article_id``; a
    comment whose article is not present is reported with ``print`` and
    not attached to anything.

    Returns the populated ``BlogData`` instance.
    """
    # The context manager closes the file even if read() raises.
    with open(self.filename, "rb") as f:
        xml_data = f.read()

    if preprocess:
        xml_data = xml_data.decode("utf8", "replace")
        xml_data = strip_xml(xml_data)
        xml_data = xml_data.encode("utf8")
        # NOTE(review): the truncation below is treated as part of the
        # preprocessing step -- confirm it is not meant to run always.
        end_pattern = "</blog_backup>"
        # NOTE(review): presumably my_rfind returns the index of the last
        # occurrence (like str.rfind); a not-found result is not handled.
        find_index = my_rfind(xml_data, end_pattern)
        end_index = find_index + len(end_pattern)
        xml_data = xml_data[0:end_index]

    tree = ET.fromstring(xml_data)
    category_nodes = tree.findall("blog_articles_categories/category")
    article_nodes = tree.findall("*/article")
    # Wretch now uses blog_articles_comments_NN (NN being a 2-digit number)
    # instead of blog_articles_comments; the wildcard matches every variant.
    comment_nodes = tree.findall("*/article_comment")

    aid_map = {}    # article id -> Article
    cid_name = {}   # category id -> category name
    blogdata = BlogData()

    # TODO: error handling
    for node in category_nodes:
        cid_name[node.findtext("id")] = node.findtext("name")

    for node in article_nodes:
        article = Article()
        article.author = node.findtext("userid")
        article.title = node.findtext("title")
        article.date = self.parse_date(node.findtext("date"))
        # In Wretch, every article has only one category.
        cid = node.findtext("category_id")
        if cid in cid_name:
            article.category.append(cid_name[cid])
        if node.findtext("isCloak") == "0":
            article.status = Article.PUBLISH
        else:
            article.status = Article.PRIVATE
        article.allow_comments = True
        article.allow_pings = True
        article.body = node.findtext("text")
        aid_map[node.findtext("id")] = article
        blogdata.articles.append(article)

    for node in comment_nodes:
        comment = Comment()
        comment.author = node.findtext("name")
        comment.email = node.findtext("email")
        comment.url = node.findtext("url")
        comment.date = datetime.strptime(node.findtext("date"),
                                         "%Y-%m-%d %H:%M:%S")
        comment.body = node.findtext("text")

        aid = node.findtext("article_id")
        try:
            article = aid_map[aid]
        except KeyError:
            # Report the comment's own id rather than a stale category id.
            # Presumably <article_comment> has an <id> child like the other
            # nodes; findtext() gives None if it does not.
            comment_id = node.findtext("id")
            print("Comment %s missing article %s" % (comment_id, aid))
        else:
            article.comments.append(comment)

    # TODO: process category
    return blogdata
def test_strip(self):
    """Assert strip_xml and old_strip_xml agree on 20 strings.

    Each string comes from a fresh call to ``self._gen_string()``.
    """
    for _ in xrange(0, 20):
        sample = self._gen_string()
        # Evaluate the new implementation first, then the reference one.
        actual = strip_xml(sample)
        expected = old_strip_xml(sample)
        self.assertEqual(actual, expected)
def test_strip(self):
    """Compare strip_xml with old_strip_xml over 20 rounds.

    Every round draws one string from ``self._gen_string()`` and requires
    both functions to return equal results for it.
    """
    for _round in xrange(0, 20):
        text = self._gen_string()
        new_result = strip_xml(text)
        old_result = old_strip_xml(text)
        self.assertEqual(new_result, old_result)