Python get_url_content示例，pyquickhelper.loghelper.get_url_content Python示例

示例#1

0

显示文件

文件： test_documentation_server.py 项目： sdpython/pyquickhelper

    def test_server_start_run(self):
        """
        Starts the documentation server on ``localhost:8094`` and checks
        the pages it serves.

        The test downloads the project home page and the search page
        (the latter twice, the second time with ``True`` as second
        argument of ``get_url_content``) and looks for known fragments.
        The server is stopped in a ``finally`` clause so that a failing
        assertion does not leave it running on port 8094.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")
        path = os.path.abspath(os.path.split(__file__)[0])
        data = os.path.join(path, "data")

        server = 'localhost'
        thread = run_doc_server(server, {"pyquickhelper": data},
                                True, port=8094)

        try:
            url = "http://localhost:8094/pyquickhelper/"
            cont = get_url_content(url)
            self.assertNotEmpty(cont)
            self.assertIn("GitHub/pyquickhelper</a>", cont)
            fLOG("-------")
            url = "http://localhost:8094/pyquickhelper/search.html?q=flog&check_keywords=yes&area=default"
            cont = get_url_content(url)
            self.assertNotEmpty(cont)
            self.assertIn(
                "Please activate JavaScript to enable the search", cont)
            self.assertIn("http://sphinx.pocoo.org/", cont)

            cont = get_url_content(url, True)
            self.assertNotEmpty(cont)
            self.assertIn(
                "Please activate JavaScript to enable the search", cont)
            self.assertIn("http://sphinx.pocoo.org/", cont)
        finally:
            # always stop the server, even when a check above failed
            thread.shutdown()

        if thread.is_alive():
            fLOG("thread is still alive?", thread.is_alive())
            # self.fail is not removed by ``python -O`` unlike ``assert False``
            self.fail("the documentation server thread is still alive")

示例#2

0

显示文件

    def test_server_start_run(self):
        """
        Runs the documentation server on ``localhost:8094``, serving the
        local folder ``data`` under the name ``pyquickhelper``, and checks
        the content of the home page and of the search page.

        The shutdown happens in a ``finally`` clause: without it, a
        failing assertion would leave the server thread running.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")
        path = os.path.abspath(os.path.split(__file__)[0])
        data = os.path.join(path, "data")

        server = 'localhost'
        thread = run_doc_server(server, {"pyquickhelper": data},
                                True, port=8094)

        try:
            # home page
            url = "http://localhost:8094/pyquickhelper/"
            cont = get_url_content(url)
            self.assertNotEmpty(cont)
            self.assertIn("GitHub/pyquickhelper</a>", cont)
            fLOG("-------")

            # search page, default call
            url = "http://localhost:8094/pyquickhelper/search.html?q=flog&check_keywords=yes&area=default"
            cont = get_url_content(url)
            self.assertNotEmpty(cont)
            self.assertIn(
                "Please activate JavaScript to enable the search", cont)
            self.assertIn("http://sphinx.pocoo.org/", cont)

            # search page, second positional argument set to True
            cont = get_url_content(url, True)
            self.assertNotEmpty(cont)
            self.assertIn(
                "Please activate JavaScript to enable the search", cont)
            self.assertIn("http://sphinx.pocoo.org/", cont)
        finally:
            # release the server whatever the outcome of the checks
            thread.shutdown()

        if thread.is_alive():
            fLOG("thread is still alive?", thread.is_alive())
            # explicit failure, not stripped when assertions are disabled
            self.fail("the documentation server thread is still alive")

示例#3

0

显示文件

def get_elysee_speech_from_elysees(
        title, url="http://www.elysee.fr/chronologie/article/"):
    """
    Retrieves the text of a document from the Elysée website.

    @param      title       title of the document, or a full url if it
                            starts with ``http``
    @param      url         base address, it must end with ``/``
    @return                 result of ``xmlParsingLongestDiv`` applied to
                            the downloaded page, None if the download failed
    @raise      Exception   if *url* does not end with ``/``

    The function tries something like::

        url + title.replace(" ","-")
    """
    import warnings

    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise Exception("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(
            " ", "-").replace("'", "-").replace('"', "")
        # url already ends with "/": adding one more produced "//"
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        # best effort: callers skip documents for which None is returned,
        # the failure is reported as a warning instead of being hidden
        warnings.warn("Unable to retrieve '{0}' - {1}".format(full, e))
        return None
    return xmlParsingLongestDiv(text)

示例#4

0

显示文件

def get_elysee_speech_from_elysees(
        title, url="http://www.elysee.fr/chronologie/article/"):
    """
    Downloads a document from the Elysée website and extracts its text.

    @param      title       title of the document; a value starting with
                            ``http`` is used as the full address
    @param      url         base address (must end with ``/``), only used
                            when *title* is not an address
    @return                 output of ``xmlParsingLongestDiv`` on the page,
                            None when the page could not be downloaded
    @raise      Exception   if *url* does not end with ``/``

    The function tries something like::

        url + title.replace(" ","-")
    """
    import warnings

    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise Exception("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(" ", "-").replace(
            "'", "-").replace('"', "")
        # no extra "/" between url and link: url ends with one already
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        # the download is optional for callers (they test for None),
        # a warning keeps track of the failure
        warnings.warn("Unable to retrieve '{0}' - {1}".format(full, e))
        return None
    return xmlParsingLongestDiv(text)

示例#5

0

显示文件

文件： discours_politique.py 项目： zebazemir/ensae_teaching_cs

def get_elysee_speech_from_elysees(title, url="https://www.elysee.fr/"):
    """
    Retrieves the text from the :epkg:`Elysees`.

    @param      title       title of the document, or a full url if it
                            starts with ``http``
    @param      url         website, it must end with ``/``
    @return                 result of ``xmlParsingLongestDiv`` applied to
                            the downloaded page, None if the download
                            failed (a warning is emitted)
    @raise      Exception   if *url* does not end with ``/``

    The function tries something like::

        url + title.replace(" ","-")
    """
    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise Exception("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(" ", "-").replace(
            "'", "-").replace('"', "")
        # url already ends with "/": adding one more produced "//"
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        # best effort: the failure is reported, callers receive None
        warnings.warn("Unable to retrieve '{0}' - {1}".format(full, e))
        return None
    return xmlParsingLongestDiv(text)

示例#6

0

显示文件

文件： discours_politique.py 项目： sdpython/ensae_teaching_cs

def get_elysee_speech_from_elysees(title, url="https://www.elysee.fr/"):
    """
    Retrieves the text from the :epkg:`Elysees`.

    @param      title       title of the document; a value starting with
                            ``http`` is used as the full address
    @param      url         website (must end with ``/``), only used when
                            *title* is not an address
    @return                 output of ``xmlParsingLongestDiv`` on the page,
                            None when the page could not be downloaded
                            (a warning is emitted in that case)
    @raise      Exception   if *url* does not end with ``/``

    The function tries something like::

        url + title.replace(" ","-")
    """
    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise Exception("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(
            " ", "-").replace("'", "-").replace('"', "")
        # no extra "/" between url and link: url ends with one already
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        # the download is optional for callers, which test for None
        warnings.warn("Unable to retrieve '{0}' - {1}".format(full, e))
        return None
    return xmlParsingLongestDiv(text)

示例#7

0

显示文件

    def test_flask_thread(self):
        """
        Runs the Flask application in a thread bound to ``localhost:8025``,
        queries the main page, ``help/exception`` and ``help/ask/for/help``,
        then posts to ``shutdown/`` and waits up to five seconds for the
        thread to stop.

        According to the original note, on Linux this test fails unless
        the firewall is told to allow the port the test uses (8025):

        ::

            sudo ufw allow 8025
            sudo ufw enable
        """
        application = create_application()
        worker = FlaskInThread(application, host="localhost", port=8025)
        worker.start()

        root = "http://localhost:8025/"

        # main page
        page = get_url_content(root)
        self.assertIn("Simple Flask Site", page)

        # page which raises an exception
        page = get_url_content(root + "help/exception")
        self.assertIn("STACK:", page)

        # help page
        page = get_url_content(root + "help/ask/for/help")
        fLOG(page)
        self.assertIn("help for command: ask/for/help", page)

        # shutdown request
        answer = requests.post(root + "shutdown/")
        fLOG(answer.text)
        self.assertIn("Server shutting down...", answer.text)

        # gives the thread up to five seconds to stop
        for attempt in range(5):
            if not worker.is_alive():
                break
            fLOG("waiting...", attempt)
            time.sleep(1)

        if worker.is_alive():
            fLOG("thread is still alive (1)?", worker.is_alive())
            assert False

示例#8

0

显示文件

文件： test_flask.py 项目： sdpython/ensae_teaching_cs

    def test_flask_thread(self):
        """
        Starts the Flask application in a separate thread on
        ``localhost:8025``, checks three pages, asks the server to shut
        down and waits for the thread to end.

        According to the original note, on Linux this test fails unless
        the firewall is told to allow the port the test uses (8025):

        ::

            sudo ufw allow 8025
            sudo ufw enable
        """
        flask_app = create_application()
        server_thread = FlaskInThread(flask_app, host="localhost", port=8025)
        server_thread.start()

        base_url = "http://localhost:8025/"

        # main page
        content = get_url_content(base_url)
        self.assertIn("Simple Flask Site", content)

        # exception page
        content = get_url_content(base_url + "help/exception")
        self.assertIn("STACK:", content)

        # help page for a command
        content = get_url_content(base_url + "help/ask/for/help")
        fLOG(content)
        self.assertIn("help for command: ask/for/help", content)

        # shutdown
        response = requests.post(base_url + "shutdown/")
        fLOG(response.text)
        self.assertIn("Server shutting down...", response.text)

        # polls the thread once per second, five times at most
        waited = 0
        while waited < 5:
            if not server_thread.is_alive():
                break
            fLOG("waiting...", waited)
            time.sleep(1)
            waited += 1

        if server_thread.is_alive():
            fLOG("thread is still alive (1)?", server_thread.is_alive())
            assert False

示例#9

0

显示文件

    def test_flask(self):
        """
        Runs the Flask application ``app`` in a thread on
        ``localhost:8025``, checks the main page, ``help/exception`` and
        ``help/ask/for/help``, then posts to ``shutdown/`` and waits up to
        five seconds for the thread to stop.

        The test is skipped when ``sys.executable`` contains ``travis``.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        if "travis" in sys.executable:
            # skip travis and Flask
            return

        th = FlaskInThread(app, host="localhost", port=8025)
        th.start()

        site = "http://localhost:8025/"

        # main page
        c = get_url_content(site)
        # the original asserted a non-empty string, which is always true
        assert "Simple Flask Site" in c

        # exception
        c = get_url_content(site + "help/exception")
        assert "STACK:" in c

        # help for
        c = get_url_content(site + "help/ask/for/help")
        fLOG(c)
        assert "help for command: ask/for/help" in c

        # shutdown
        c = requests.post(site + "shutdown/")
        fLOG(c.text)
        assert "Server shutting down..." in c.text

        # gives the thread up to five seconds to stop
        nb = 0
        while th.is_alive() and nb < 5:
            fLOG("waiting...", nb)
            time.sleep(1)
            nb += 1

        if th.is_alive():
            fLOG("thread is still alive (1)?", th.is_alive())
            assert False

示例#10

0

显示文件

    def test_flask(self):
        """
        Starts the Flask application ``app`` in a thread listening on
        ``localhost:8025`` and checks the content of three pages before
        asking the server to shut down.

        Nothing is tested when ``sys.executable`` contains ``travis``.
        """
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        if "travis" in sys.executable:
            # skip travis and Flask
            return

        th = FlaskInThread(app, host="localhost", port=8025)
        th.start()

        site = "http://localhost:8025/"

        # main page
        c = get_url_content(site)
        # ``assert "Simple Flask Site"`` alone never fails: the content
        # of the page must be part of the condition
        assert "Simple Flask Site" in c

        # exception
        c = get_url_content(site + "help/exception")
        assert "STACK:" in c

        # help for
        c = get_url_content(site + "help/ask/for/help")
        fLOG(c)
        assert "help for command: ask/for/help" in c

        # shutdown
        c = requests.post(site + "shutdown/")
        fLOG(c.text)
        assert "Server shutting down..." in c.text

        # polls the thread once per second, five times at most
        nb = 0
        while th.is_alive() and nb < 5:
            fLOG("waiting...", nb)
            time.sleep(1)
            nb += 1

        if th.is_alive():
            fLOG("thread is still alive (1)?", th.is_alive())
            assert False

示例#11

0

显示文件

    def test_flask(self):
        """
        Runs the Flask application ``app`` in a thread on
        ``localhost:8025``, checks the main page, ``help/exception`` and
        ``help/ask/for/help``, posts to ``shutdown/`` and waits up to five
        seconds for the thread to stop.

        The test returns immediately when ``is_travis_or_appveyor()``
        returns ``'travis'`` or ``'circleci'``.
        """
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        if is_travis_or_appveyor() in ('travis', 'circleci'):
            # Get an error: urllib.error.URLError: <urlopen error [Errno 99] Cannot assign requested address>.
            return

        worker = FlaskInThread(app, host="localhost", port=8025)
        worker.start()

        root = "http://localhost:8025/"

        # main page
        page = get_url_content(root)
        self.assertIn("Simple Flask Site", page)

        # page which raises an exception
        page = get_url_content(root + "help/exception")
        self.assertIn("STACK:", page)

        # help page
        page = get_url_content(root + "help/ask/for/help")
        fLOG(page)
        self.assertIn("help for command: ask/for/help", page)

        # shutdown request
        answer = requests.post(root + "shutdown/")
        fLOG(answer.text)
        self.assertIn("Server shutting down...", answer.text)

        # gives the thread up to five seconds to stop
        for attempt in range(5):
            if not worker.is_alive():
                break
            fLOG("waiting...", attempt)
            time.sleep(1)

        if worker.is_alive():
            fLOG("thread is still alive (1)?", worker.is_alive())
            assert False

示例#12

0

显示文件

def enumerate_speeches_from_elysees(skip=0, use_json=False):
    """
    Enumerates the speeches published on the Elysée website.

    @param      skip        number of entries skipped at the beginning of
                            the list (events in json mode, links in xml mode)
    @param      use_json    use the json stream instead of the xml one
                            (json format is incomplete)
    @return                 enumerate dictionaries, with keys
                            ``text, title, date, description`` in json
                            mode, ``link, text`` in xml mode

    Documents for which ``get_elysee_speech_from_elysees`` returns None
    are not part of the enumeration.

    .. exref::
        :title: Retrieve speeches of the president of the French Republic
        :tag: Exercice

        ::

            for i,disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    """
    if use_json:
        url = "http://www.elysee.fr/chronologie/download/json"
        js = retrieve_speeches_json(url)
        for i, event in enumerate(js):
            if i < skip:
                continue
            items = event.get("items", None)
            title = event.get("title", None)
            if items is None or title is None or len(title) == 0:
                continue
            # an event is kept if one of its items is a dictionary
            # whose title mentions "discours"
            load = False
            for it in items:
                if not isinstance(it, dict):
                    # covers None as well
                    continue
                tit = it.get("title", "")
                if tit is not None and "title" in it and "discours" in tit:
                    load = True
                    break
            if not load:
                continue
            content = get_elysee_speech_from_elysees(title)
            if content is not None:
                yield dict(text=content,
                           title=title,
                           date=event.get("date", None),
                           description=event.get("description", None))
    else:
        url = "http://www.elysee.fr/chronologie/download/xml"
        xml = get_url_content(url)
        reg = re.compile("(http://.*?/article/.*?/)")
        links = reg.findall(xml)
        for i, link in enumerate(links):
            # skip was ignored in this branch although it is documented
            if i < skip:
                continue
            content = get_elysee_speech_from_elysees(link)
            if content is not None:
                yield dict(link=link, text=content)

示例#13

0

显示文件

def retrieve_speeches_json(
        url="http://www.elysee.fr/chronologie/download/json"):
    """
    Downloads the json stream of speeches from the Elysée website
    and parses it.

    @param      url     address of the json stream
    @return             parsed json content (list of documents)
    """
    # the downloaded text is wrapped into a stream and given to json.load
    with io.StringIO(get_url_content(url)) as buffer:
        return json.load(buffer)

示例#14

0

显示文件

def retrieve_speeches_json(
        url="http://www.elysee.fr/chronologie/download/json"):
    """
    Retrieves the speeches from the Elysée website as parsed json.

    @param      url     address the json content is downloaded from
    @return             list of documents, as returned by ``json.load``
    """
    downloaded = get_url_content(url)
    # json.load reads from a text stream built on the downloaded content
    return json.load(io.StringIO(downloaded))

示例#15

0

显示文件

def enumerate_speeches_from_elysees(skip=0, use_json=False):
    """
    Enumerates speeches retrieved from the Elysée website.

    @param      skip        number of entries to skip at the beginning
                            (events of the json stream or links found in
                            the xml stream)
    @param      use_json    if True, reads the json stream, otherwise the
                            xml one (json format is incomplete)
    @return                 enumerate dictionaries: keys
                            ``text, title, date, description`` with json,
                            keys ``link, text`` with xml

    A document is skipped when ``get_elysee_speech_from_elysees``
    returns None for it.

    .. exref::
        :title: Retrieve speeches of the president of the French Republic
        :tag: Exercice

        ::

            for i,disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    """
    if use_json:
        url = "http://www.elysee.fr/chronologie/download/json"
        js = retrieve_speeches_json(url)
        for i, event in enumerate(js):
            if i < skip:
                continue
            items = event.get("items", None)
            title = event.get("title", None)
            if items is None or title is None or len(title) == 0:
                continue
            # looks for an item (a dictionary) with a title
            # containing "discours"
            load = False
            for it in items:
                if not isinstance(it, dict):
                    # None is not a dictionary either
                    continue
                tit = it.get("title", "")
                if tit is not None and "title" in it and "discours" in tit:
                    load = True
                    break
            if not load:
                continue
            content = get_elysee_speech_from_elysees(title)
            if content is not None:
                yield dict(text=content,
                           title=title,
                           date=event.get("date", None),
                           description=event.get("description", None))
    else:
        url = "http://www.elysee.fr/chronologie/download/xml"
        xml = get_url_content(url)
        reg = re.compile("(http://.*?/article/.*?/)")
        links = reg.findall(xml)
        for i, link in enumerate(links):
            # the xml branch did not honor the documented skip parameter
            if i < skip:
                continue
            content = get_elysee_speech_from_elysees(link)
            if content is not None:
                yield dict(link=link, text=content)

示例#16

0

显示文件

    def test_server_start_run(self):
        """
        Starts the documentation server on ``localhost:8094`` and checks
        the home page and the search page it serves.

        The test returns immediately on Python 2 and when
        ``is_travis_or_appveyor()`` returns ``"appveyor"``.
        The server is stopped in a ``finally`` clause so that a failing
        assertion does not leave it running on port 8094.
        """
        if sys.version_info[0] == 2:
            return
        if is_travis_or_appveyor() == "appveyor":
            return

        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")
        path = os.path.abspath(os.path.split(__file__)[0])
        data = os.path.join(path, "data")

        server = 'localhost'
        thread = run_doc_server(
            server, {
                "pyquickhelper": data}, True, port=8094)

        try:
            url = "http://localhost:8094/pyquickhelper/"
            cont = get_url_content(url)
            assert len(cont) > 0
            assert "GitHub/pyquickhelper</a>" in cont
            fLOG("-------")
            url = "http://localhost:8094/pyquickhelper/search.html?q=flog&check_keywords=yes&area=default"
            cont = get_url_content(url)
            assert len(cont) > 0
            assert "Please activate JavaScript to enable the search" in cont
            assert "http://sphinx.pocoo.org/" in cont

            cont = get_url_content(url, True)
            assert len(cont) > 0
            assert "Please activate JavaScript to enable the search" in cont
            assert "http://sphinx.pocoo.org/" in cont
        finally:
            # always stop the server, even when a check above failed
            thread.shutdown()

        if thread.is_alive():
            fLOG("thread is still alive?", thread.is_alive())
            assert False

示例#17

0

显示文件

文件： discours_politique.py 项目： zebazemir/ensae_teaching_cs

def enumerate_speeches_from_elysees(url="agenda", skip=0):
    """
    Enumerates speeches from the :epkg:`Elysees`.

    @param      url         subaddress, url source will be
                            ``'https://www.elysee.fr/' + url``,
                            a value starting with ``http`` is used as is
    @param      skip        skip the first *skip* one in the list
    @return                 enumerate dictionaries with keys
                            ``link, text``
    @raise      ValueError  if no link was extracted from the page

    .. exref::
        :title: Retrieve speeches of the president of the French Republic
        :tag: Exercice

        ::

            for i, disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    Others links can be used such as
    ``https://www.elysee.fr/recherche?query=discours``.
    The website changed in 2018 and no longer support xml or json
    streams.
    """
    base = "https://www.elysee.fr/"
    if not url.startswith("http"):
        url = base + url
    xml = get_url_content(url)
    reg = re.compile(
        "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
    links = reg.findall(xml)
    for i, link in enumerate(links):
        if i < skip:
            continue
        if link.startswith("/"):
            # base ends with "/" and link starts with one:
            # a plain concatenation produced "//" in the address
            link = base.rstrip("/") + link
        content = get_elysee_speech_from_elysees(link)
        if content is not None:
            yield dict(link=link, text=content)
    if len(links) == 0:
        # raised when the generator is consumed, the page is included
        # in the message to help updating the pattern
        raise ValueError(
            "Unable to extract links from url='{0}'\npattern='{1}'\n-----\n{2}"
            .format(url, reg, xml))

示例#18

0

显示文件

文件： discours_politique.py 项目： sdpython/ensae_teaching_cs

def enumerate_speeches_from_elysees(url="agenda", skip=0):
    """
    Enumerates speeches from the :epkg:`Elysees`.

    @param      url         subaddress, url source will be
                            ``'https://www.elysee.fr/' + url``
                            unless it already starts with ``http``
    @param      skip        skip the first *skip* one in the list
    @return                 enumerate dictionaries, each of them has
                            keys ``link`` and ``text``
    @raise      ValueError  if the pattern found no link in the page

    .. exref::
        :title: Retrieve speeches of the president of the French Republic
        :tag: Exercice

        ::

            for i, disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    Others links can be used such as
    ``https://www.elysee.fr/recherche?query=discours``.
    The website changed in 2018 and no longer support xml or json
    streams.
    """
    base = "https://www.elysee.fr/"
    if not url.startswith("http"):
        url = base + url
    xml = get_url_content(url)
    reg = re.compile(
        "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
    links = reg.findall(xml)
    for i, link in enumerate(links):
        if i < skip:
            continue
        if link.startswith("/"):
            # avoids "//" in the address: base already ends with "/"
            link = base.rstrip("/") + link
        content = get_elysee_speech_from_elysees(link)
        if content is not None:
            yield dict(link=link, text=content)
    if len(links) == 0:
        # happens when the generator is consumed; the downloaded page is
        # part of the message to help fixing the pattern
        raise ValueError("Unable to extract links from url='{0}'\npattern='{1}'\n-----\n{2}".format(
            url, reg, xml))

示例#19

0

显示文件

文件： documentation_server.py 项目： xadupre/pyquickhelper

    def serve_content(self, cpath, method="GET"):
        """
        Tells what to do based on the path. The function serves the main
        page for an empty path or ``/``. Otherwise the first section of
        the path is considered as a project name and looked up in
        ``DocumentationHandler.mappings``:

        * no mapping: the answer is a 404,
        * ``shut://``: the server is shut down,
        * ``http://``: the rest of the path is fetched with
          ``get_url_content`` and sent back,
        * anything else: the value is used as a local folder and the
          rest of the path as a file in this folder.

        If you type ``http://localhost:8080/root/file``,
        assuming ``root`` is mapped to a local folder.
        It will display this file.

        @param      cpath        ParseResult
        @param      method      GET or POST (not used in this method's body)
        """
        if cpath.path == "" or cpath.path == "/":
            # params is computed but not used in this branch
            params = parse_qs(cpath.query)
            self.serve_main_page()
        else:
            params = parse_qs(cpath.query)
            params["__path__"] = cpath

            # fullurl = cpath.geturl()
            fullfile = cpath.path
            params["__url__"] = cpath
            spl = fullfile.strip("/").split("/")

            # first section of the path: project, the rest: file in the project
            project = spl[0]
            link = "/".join(spl[1:])
            value = DocumentationHandler.mappings.get(project, None)

            if value is None:
                # unknown project
                self.LOG("can't serve", cpath)
                self.LOG("with params", params)
                self.send_response(404)
                #raise KeyError("unable to find a mapping associated to: " + project + "\nURL:\n" + url + "\nPARAMS:\n" + str(params))

            elif value == "shut://":
                self.LOG("call shutdown")
                self.shutdown()

            elif value == "http://":
                self.send_response(200)
                self.send_headers("debug.html")
                # removes every occurrence of "/<project>/" from the path,
                # what remains is the address to fetch
                url = cpath.path.replace("/%s/" % project, "")
                try:
                    content = get_url_content(url)
                except Exception as e:  # pragma: no cover
                    # the error is sent back to the client as an html page
                    content = "<html><body>ERROR (2): %s</body></html>" % e
                self.feed(content, False, params={})

            else:
                # value is used as a local folder
                if ".." in link:
                    # we avoid that case to prevent users from digging others paths
                    # than the mapped ones, just in that the browser does not
                    # remove them
                    self.send_error(404)
                    self.feed("Requested resource %s unavailable" % link)
                else:
                    # we do not expect the documentation to point to the root
                    # it must be relative paths
                    localpath = link.lstrip("/")
                    if localpath in [None, "/", ""]:
                        # no file specified: default page
                        localpath = "index.html"
                    fullpath = os.path.join(value, localpath)
                    self.LOG("localpath ", fullpath, os.path.isfile(fullpath))

                    # NOTE(review): the 200 status is sent before the file is
                    # read, send_error(404) may be called afterwards - confirm
                    # this is intended
                    self.send_response(200)
                    _, ftype = self.get_ftype(localpath)

                    # NOTE(security): eval is applied to a value read from
                    # the query string (parameter "execute"), any expression
                    # sent by the client gets evaluated; comparing the string
                    # to "True" / "False" would avoid it
                    execute = eval(params.get("execute", ["True"])[0])
                    spath = params.get("path", [None])[0]
                    # keep = eval(params.get("keep", ["False"])[0])

                    if ftype != 'execute' or not execute:
                        # the file is displayed, not executed
                        content = self.get_file_content(fullpath, ftype, spath)
                        if content is None:
                            self.LOG("** w,unable to get file for key:", spath)
                            self.send_error(404)
                            self.feed("Requested resource %s unavailable" %
                                      localpath)
                        else:
                            ext = os.path.splitext(localpath)[-1].lower()
                            if ext in [
                                    ".py", ".c", ".cpp", ".hpp", ".h", ".r",
                                    ".sql", ".java"
                            ]:
                                # source code is rendered as html
                                self.send_headers(".html")
                                self.feed(
                                    DocumentationHandler.html_code_renderer(
                                        localpath, content))
                            elif ext in [".html"]:
                                # html goes through process_html_path before
                                # being sent
                                content = DocumentationHandler.process_html_path(
                                    project, content)
                                self.send_headers(localpath)
                                self.feed(content)
                            else:
                                # any other extension is sent as is
                                self.send_headers(localpath)
                                self.feed(content)
                    else:
                        # the file is executed and its output is sent back,
                        # any content in err is reported as a 404
                        self.LOG("execute file ", localpath)
                        out, err = DocumentationHandler.execute(localpath)
                        if len(err) > 0:
                            self.send_error(404)
                            self.feed("Requested resource %s unavailable" %
                                      localpath)
                        else:
                            self.send_headers(localpath)
                            self.feed(out)