Exemplos de Parser em Python, exemplos de myparser.Parser em Python

Exemplo n.º 1

0

Exibir arquivo

 def test_hostnames(self):
     """Check that Parser.hostnames() pulls every ``qq.com`` host out of noisy text.

     The input mixes three hostnames with punctuation and filler characters;
     the sorted result must contain exactly those three hosts.
     """
     word = "qq.com"
     results = "@chuangshi.qq.com****test.im.qq.com**fskfhqefm,nfuq!@#$!^#$^&mail.qq.com"
     p = myparser.Parser(results, word)
     hostnames = sorted(p.hostnames())
     # Call form of print: same output on Python 2 for a single argument,
     # and no longer a SyntaxError on Python 3.
     print(hostnames)
     self.assertEqual(hostnames,
                      ['chuangshi.qq.com', 'mail.qq.com', 'test.im.qq.com'])

Exemplo n.º 2

0

Exibir arquivo

Arquivo: googlesearch.py Projeto: sry309/Zeus

 def get_hostnames(self):
     """Return the hostnames myparser.Parser finds in the collected results.

     A Parser is built from ``self.total_results`` and ``self.word`` and its
     ``hostnames()`` result is handed back unchanged.
     """
     return myparser.Parser(self.total_results, self.word).hostnames()

Exemplo n.º 3

0

Exibir arquivo

Arquivo: main.py Projeto: schamanegeist/domparser

def main():
    """Re-template the HTML files found under a root directory.

    Uses the last two command-line arguments:

    * ``sys.argv[-2]`` -- the directory housing the original files;
    * ``sys.argv[-1]`` -- the template for the new files.

    Steps:

    1. Create ``<root>/pages`` (an already existing directory is fine).
    2. Parse the template file into ``template_root``.
    3. For every file directly in the root accepted by ``filter_html``:
       parse it, hand the first element with ``id="content"`` to the
       template's ``main`` via ``adopt_child_into``, and overwrite the file
       with the rendered template.
    4. For every sub-directory of the root: build a ``div#content`` holding
       one ``<article>`` per accepted file, hand it to the template's
       ``main``, and write the result to
       ``<root>/pages/<dirname>_index.shtml``.

    Prints a usage message and exits when fewer than two arguments are
    supplied.
    """
    if len(sys.argv) < 3:
        print("Missing arguments. This program needs:\n"
              "\tA directory housing the original files; and\n"
              "\ta template for the new files.")
        # sys.exit() rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. ``python -S`` or frozen builds).
        sys.exit()

    root = sys.argv[-2]
    template_path = sys.argv[-1]

    # Listed before "pages" is created; only regular files are kept below.
    local_files = map(fully_qual(root), os.listdir(root))
    try:
        os.mkdir(os.path.join(root, "pages"))
    except FileExistsError:
        pass

    # Both filters are lazy: the predicates run while the loops iterate.
    subdirs = filter(os.path.isdir, map(fully_qual(root), os.listdir(root)))
    html_files = filter(os.path.isfile, filter(filter_html, local_files))

    # Parse template file into a root.
    hp = myparser.Parser(htmlelem.Element("", -1))
    # Context manager so the template's file handle is closed promptly.
    with io.open(template_path) as template_file:
        hp.feed(template_file.read())
    template_root = hp.root

    # Create new root and connect to HTML myparser.
    HTML_root = htmlelem.Element("", -1)
    hp.root = HTML_root

    # Element-id prefixes looked up (as "<prefix>-pg") in every
    # sub-directory page; loop-invariant, so built once.
    section_ids = ("ab", "bb", "bsi", "cankers", "csi", "fd", "fid", "iwp",
                   "mistletoes", "nid", "rd", "sap", "scrp", "sds", "wb")

    for path in html_files:
        with io.open(path, mode="r", encoding="utf-8", errors="ignore") as f:
            hp.feed(f.read())

        HTML_root = hp.root

        try:
            template_root.adopt_child_into(
                HTML_root.search_attributes("id", "content")[0], "main")
        except IndexError:
            # No element with id="content": the template is written out
            # without page content (original behaviour, kept as-is).
            pass

        hp.reset()
        HTML_root.clear_children()

        # The source file itself is overwritten with the rendered template.
        with io.open(path,
                     mode="w",
                     newline="",
                     encoding="utf-8",
                     errors="ignore") as f:
            f.write(template_root.convert_to_string())

        # Empty the template's <main> again so it can take the next page.
        template_root.search_tag("main")[0].clear_children()

    for subdir in subdirs:
        local_files = map(fully_qual(subdir), os.listdir(subdir))
        html_files = filter(os.path.isfile, filter(filter_html, local_files))

        local_dir = os.path.split(subdir)[1]

        content_div = htmlelem.Element("div", 0, id="content")

        for path in html_files:
            with io.open(path, mode="r", encoding="utf-8",
                         errors="ignore") as f:
                hp.feed(f.read())

            HTML_root = hp.root
            article = htmlelem.Element("article", 0, id="")

            for section_id in section_ids:
                try:
                    article.adopt_child(
                        HTML_root.search_attributes(
                            "id", section_id + "-pg")[0])
                except IndexError:
                    # This page has no such section; try the next id.
                    continue

            try:
                article.adopt_child(
                    HTML_root.search_attributes("id", "content")[0])
            except IndexError:
                pass

            content_div.adopt_child(article)
            hp.reset()
            HTML_root.clear_children()

        template_root.adopt_child_into(content_div, "main")
        output = os.path.join(root, "pages", local_dir + "_index.shtml")
        with io.open(output,
                     mode="w+",
                     newline="",
                     encoding="utf-8",
                     errors="ignore") as f:
            f.write(template_root.convert_to_string())

        # Empty the template's <main> again before the next directory.
        template_root.search_tag("main")[0].clear_children()