Python strip_xml示例，blogtrans.util.XMLStripper.strip_xml Python示例

示例#1

0

显示文件

文件： WretchImporter.py 项目： Leeheng86/blogtrans

    def __parse(self, preprocess):
        """Parse the backup file at ``self.filename`` into a BlogData.

        Args:
            preprocess: when true, the raw bytes are decoded as UTF-8 (with
                undecodable bytes replaced), passed through ``strip_xml`` and
                re-encoded as UTF-8 before being handed to the XML parser.

        Returns:
            A BlogData whose ``articles`` list holds one Article per
            ``*/article`` node; every comment whose ``article_id`` matches an
            article is appended to that article's ``comments``.

        Raises:
            Any error raised by ``ET.fromstring`` or ``datetime.strptime`` on
            input they cannot parse is propagated unchanged.
        """
        # The context manager closes the file even if read() raises.
        with open(self.filename, "rb") as f:
            xml_data = f.read()

        if preprocess:
            xml_data = xml_data.decode("utf8", "replace")
            xml_data = strip_xml(xml_data)
            xml_data = xml_data.encode("utf8")

        # Keep the data only up to the end of the closing root tag.
        # NOTE(review): presumably my_rfind returns the start index of the
        # last match; what it returns when the tag is absent is not visible
        # here - confirm before relying on the slice below.
        end_pattern = "</blog_backup>"
        find_index = my_rfind(xml_data, end_pattern)
        end_index = find_index + len(end_pattern)
        xml_data = xml_data[0:end_index]
        tree = ET.fromstring(xml_data)

        category_nodes = tree.findall("blog_articles_categories/category")
        article_nodes = tree.findall("*/article")

        # Wretch now uses blog_articles_comments_%d%d (a 2-digit suffix)
        # instead of blog_articles_comments; the wildcard parent matches
        # every such container, so all comments are found.
        comment_nodes = tree.findall("*/article_comment")

        aid_map = {}   # article id -> Article
        cid_name = {}  # category id -> category name

        blogdata = BlogData()

        # TODO: error handling
        for node in category_nodes:
            cid_name[node.findtext("id")] = node.findtext("name")

        for node in article_nodes:
            article = Article()

            article.author = node.findtext("userid")
            article.title = node.findtext("title")
            article.date = self.parse_date(node.findtext("date"))

            # In Wretch, every article has only one category.
            cid = node.findtext("category_id")
            if cid in cid_name:
                article.category.append(cid_name[cid])

            if node.findtext("isCloak") == "0":
                article.status = Article.PUBLISH
            else:
                article.status = Article.PRIVATE

            article.allow_comments = True
            article.allow_pings = True

            article.body = node.findtext("text")

            aid_map[node.findtext("id")] = article
            blogdata.articles.append(article)

        for node in comment_nodes:
            comment = Comment()

            comment.author = node.findtext("name")
            comment.email = node.findtext("email")
            comment.url = node.findtext("url")
            comment.date = datetime.strptime(node.findtext("date"), "%Y-%m-%d %H:%M:%S")
            comment.body = node.findtext("text")
            comment.host = node.findtext("blog_id")
            comment.reply = node.findtext("reply")
            # findtext returns None when <reply> is absent and "" when it is
            # empty; neither carries a reply date worth parsing.
            if comment.reply:
                comment.rdate = datetime.strptime(node.findtext("reply_date"), "%Y-%m-%d %H:%M:%S")

            aid = node.findtext("article_id")
            try:
                article = aid_map[aid]
            except KeyError:
                # Report the comment itself rather than a stale category id
                # left over from the article loop.
                # NOTE(review): assumes <article_comment> has an <id> child;
                # if it does not, None is printed - confirm against real data.
                print("Comment %s missing article %s" % (node.findtext("id"), aid))
            else:
                article.comments.append(comment)

        # TODO: process category
        return blogdata

示例#2

0

显示文件

    def __parse(self, preprocess):
        """Parse the backup file at ``self.filename`` into a BlogData.

        Args:
            preprocess: when true, the raw bytes are decoded as UTF-8 (with
                undecodable bytes replaced), passed through ``strip_xml`` and
                re-encoded as UTF-8 before being handed to the XML parser.

        Returns:
            A BlogData whose ``articles`` list holds one Article per
            ``*/article`` node; every comment whose ``article_id`` matches an
            article is appended to that article's ``comments``.

        Raises:
            Any error raised by ``ET.fromstring`` or ``datetime.strptime`` on
            input they cannot parse is propagated unchanged.
        """
        # The context manager closes the file even if read() raises.
        with open(self.filename, "rb") as f:
            xml_data = f.read()

        if preprocess:
            xml_data = xml_data.decode("utf8", "replace")
            xml_data = strip_xml(xml_data)
            xml_data = xml_data.encode("utf8")

        # Keep the data only up to the end of the closing root tag.
        # NOTE(review): presumably my_rfind returns the start index of the
        # last match; what it returns when the tag is absent is not visible
        # here - confirm before relying on the slice below.
        end_pattern = "</blog_backup>"
        find_index = my_rfind(xml_data, end_pattern)
        end_index = find_index + len(end_pattern)
        xml_data = xml_data[0:end_index]
        tree = ET.fromstring(xml_data)

        category_nodes = tree.findall("blog_articles_categories/category")
        article_nodes = tree.findall("*/article")

        # Wretch now uses blog_articles_comments_%d%d (a 2-digit suffix)
        # instead of blog_articles_comments; the wildcard parent matches
        # every such container, so all comments are found.
        comment_nodes = tree.findall("*/article_comment")

        aid_map = {}   # article id -> Article
        cid_name = {}  # category id -> category name

        blogdata = BlogData()

        # TODO: error handling
        for node in category_nodes:
            cid_name[node.findtext("id")] = node.findtext("name")

        for node in article_nodes:
            article = Article()

            article.author = node.findtext("userid")
            article.title = node.findtext("title")
            article.date = self.parse_date(node.findtext("date"))

            # In Wretch, every article has only one category.
            cid = node.findtext("category_id")
            if cid in cid_name:
                article.category.append(cid_name[cid])

            if node.findtext("isCloak") == "0":
                article.status = Article.PUBLISH
            else:
                article.status = Article.PRIVATE

            article.allow_comments = True
            article.allow_pings = True

            article.body = node.findtext("text")

            aid_map[node.findtext("id")] = article
            blogdata.articles.append(article)

        for node in comment_nodes:
            comment = Comment()

            comment.author = node.findtext("name")
            comment.email = node.findtext("email")
            comment.url = node.findtext("url")
            comment.date = datetime.strptime(node.findtext("date"),
                                             "%Y-%m-%d %H:%M:%S")
            comment.body = node.findtext("text")

            aid = node.findtext("article_id")
            try:
                article = aid_map[aid]
            except KeyError:
                # Report the comment itself rather than a stale category id
                # left over from the article loop.
                # NOTE(review): assumes <article_comment> has an <id> child;
                # if it does not, None is printed - confirm against real data.
                print("Comment %s missing article %s" % (node.findtext("id"), aid))
            else:
                article.comments.append(comment)

        # TODO: process category
        return blogdata

示例#3

0

显示文件

文件： StripXMLTest.py 项目： peicheng/blogtrans

    def test_strip(self):
        """Check strip_xml against old_strip_xml on 20 strings.

        Each input comes from ``self._gen_string()``; both functions must
        return equal results for it.
        """
        for _ in xrange(0, 20):
            sample = self._gen_string()
            actual = strip_xml(sample)
            self.assertEqual(actual, old_strip_xml(sample))

示例#4

0

显示文件

文件： StripXMLTest.py 项目： Leeheng86/blogtrans

    def test_strip(self):
        """Assert that strip_xml and old_strip_xml agree on 20 inputs.

        Every input is obtained from ``self._gen_string()``.
        """
        for _round in xrange(0, 20):
            text = self._gen_string()
            new_result = strip_xml(text)
            old_result = old_strip_xml(text)
            self.assertEqual(new_result, old_result)