Example #1
0
def fetch_og_metadata(user_agent, links):
    """Fetch OpenGraph metadata for each link.

    Args:
        user_agent: User-Agent header value sent with each HTTP request.
        links: iterable of URLs to inspect.

    Returns:
        A list of dicts of OpenGraph data, one per link that was an
        HTML page with a parseable ``og:url``.

    Raises:
        Whatever ``check_url`` raises for an invalid URL, and
        ``requests.HTTPError`` for non-2xx responses.
    """
    res = []
    for l in links:
        check_url(l)

        # Remove any AP actor from the list
        try:
            p = lookup(l)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            # Not an ActivityPub object; treat it as a regular web page.
            pass

        r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()
        # BUG FIX: headers.get() may return None when the header is absent,
        # which would raise AttributeError on .startswith(); default to "".
        if not r.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {l}")
            continue

        # Force UTF-8 decoding before parsing the OpenGraph tags.
        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue
        # Only keep entries that actually carry an og:url.
        if data.get("url"):
            res.append(data)

    return res
Example #2
0
def fetch_og_metadata(user_agent, links):
    """Fetch OpenGraph metadata for each link, skipping media and AP objects.

    Args:
        user_agent: User-Agent header value sent with each HTTP request.
        links: iterable of URLs to inspect.

    Returns:
        A list of dicts of OpenGraph data (with an extra ``_input_url``
        key), one per link that resolved to an HTML page with an og:url.

    Raises:
        Whatever ``check_url`` raises for an invalid URL.
    """
    res = []
    for l in links:
        # Try to skip media early
        mimetype, _ = mimetypes.guess_type(l)
        if mimetype and mimetype.split("/")[0] in ["image", "video", "audio"]:
            logger.info(f"skipping media link {l}")
            continue

        check_url(l)

        # Remove any AP objects
        try:
            lookup(l)
            continue
        except NotAnActivityError:
            # Not an ActivityPub object; treat it as a regular web page.
            pass
        except Exception:
            logger.exception(
                f"skipping {l} because of issues during AP lookup")
            continue

        # Cheap HEAD probe first so we don't download non-HTML bodies.
        try:
            h = requests.head(l,
                              headers={"User-Agent": user_agent},
                              timeout=3,
                              allow_redirects=True)
            h.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to HEAD {l}, got a {http_err.response.status_code}: {http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to HEAD {l}: {err!r}")
            continue

        # Hoist the header lookup instead of calling .get() twice.
        head_ctype = h.headers.get("content-type")
        if head_ctype and not head_ctype.startswith("text/html"):
            logger.debug(f"skipping {l} for bad content type")
            continue

        try:
            r = requests.get(l,
                             headers={"User-Agent": user_agent},
                             timeout=5,
                             allow_redirects=True)
            r.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to GET {l}, got a {http_err.response.status_code}: {http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to GET {l}: {err!r}")
            continue

        # FIXME(tsileo): check mimetype via the URL too (like we do for images)
        # A missing content-type on the GET response is treated as non-HTML.
        get_ctype = r.headers.get("content-type")
        if not get_ctype or not get_ctype.startswith("text/html"):
            continue

        # Force UTF-8 decoding before parsing the OpenGraph tags.
        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        # Keep track of the fetched URL as some crappy websites use relative URLs everywhere
        data["_input_url"] = l
        u = urlparse(l)

        # If it's a relative URL, build the absolute version
        if "image" in data and data["image"].startswith("/"):
            data["image"] = u._replace(path=data["image"],
                                       params="",
                                       query="",
                                       fragment="").geturl()

        if "url" in data and data["url"].startswith("/"):
            data["url"] = u._replace(path=data["url"],
                                     params="",
                                     query="",
                                     fragment="").geturl()

        # Only keep entries that actually carry an og:url.
        if data.get("url"):
            res.append(data)

    return res
Example #3
0
def test_urlutils_check_url_helper():
    """check_url must reject a localhost URL with InvalidURLError."""
    localhost_url = "http://localhost:5000"
    with pytest.raises(urlutils.InvalidURLError):
        urlutils.check_url(localhost_url)