import logging
import mimetypes
from urllib.parse import urlparse

import opengraph
import requests

logger = logging.getLogger(__name__)

# check_url, lookup, ap and NotAnActivityError are ActivityPub helpers from
# the surrounding project; they are assumed importable here.


def fetch_og_metadata(user_agent, links):
    res = []
    for l in links:
        check_url(l)

        # Remove any AP actor from the list
        try:
            p = lookup(l)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()

        # Guard against a missing Content-Type header (a bare .startswith()
        # on the raw header would raise AttributeError when it is absent)
        if not (r.headers.get("content-type") or "").startswith("text/html"):
            logger.debug(f"skipping {l}")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        if data.get("url"):
            res.append(data)

    return res
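# A minimal usage sketch for the version above; the user-agent string and the
# link are hypothetical, and the ActivityPub helpers must resolve for real.
if __name__ == "__main__":
    for og in fetch_og_metadata(
        "my-og-fetcher/1.0 (+https://example.com)",
        ["https://example.com/some-article"],
    ):
        print(og.get("title"), og.get("url"))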
def fetch_og_metadata(user_agent, links):
    res = []
    for l in links:
        # Try to skip media early
        mimetype, _ = mimetypes.guess_type(l)
        if mimetype and mimetype.split("/")[0] in ["image", "video", "audio"]:
            logger.info(f"skipping media link {l}")
            continue

        check_url(l)

        # Remove any AP objects
        try:
            lookup(l)
            continue
        except NotAnActivityError:
            pass
        except Exception:
            logger.exception(f"skipping {l} because of issues during AP lookup")
            continue

        try:
            h = requests.head(
                l,
                headers={"User-Agent": user_agent},
                timeout=3,
                allow_redirects=True,
            )
            h.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to HEAD {l}, got a {http_err.response.status_code}: "
                f"{http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to HEAD {l}: {err!r}")
            continue

        if h.headers.get("content-type") and not h.headers.get(
            "content-type"
        ).startswith("text/html"):
            logger.debug(f"skipping {l} for bad content type")
            continue

        try:
            r = requests.get(
                l,
                headers={"User-Agent": user_agent},
                timeout=5,
                allow_redirects=True,
            )
            r.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to GET {l}, got a {http_err.response.status_code}: "
                f"{http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to GET {l}: {err!r}")
            continue

        # FIXME(tsileo): check mimetype via the URL too (like we do for images)
        if not r.headers.get("content-type") or not r.headers.get(
            "content-type"
        ).startswith("text/html"):
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        # Keep track of the fetched URL as some crappy websites use relative URLs everywhere
        data["_input_url"] = l
        u = urlparse(l)

        # If it's a relative URL, build the absolute version
        if "image" in data and data["image"].startswith("/"):
            data["image"] = u._replace(
                path=data["image"], params="", query="", fragment=""
            ).geturl()
        if "url" in data and data["url"].startswith("/"):
            data["url"] = u._replace(
                path=data["url"], params="", query="", fragment=""
            ).geturl()

        if data.get("url"):
            res.append(data)

    return res
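# Standalone sketch of the relative-URL fix above: a root-relative "image" or
# "url" value is made absolute by keeping the page URL's scheme and host and
# swapping in the relative path (example values are made up).
page = urlparse("https://example.com/posts/42?utm=x#top")
absolute = page._replace(
    path="/static/cover.png", params="", query="", fragment=""
).geturl()
assert absolute == "https://example.com/static/cover.png"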
import pytest

# urlutils is the project module under test; it is expected to expose
# check_url and the InvalidURLError exception.


def test_urlutils_check_url_helper():
    with pytest.raises(urlutils.InvalidURLError):
        urlutils.check_url("http://localhost:5000")
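# A hedged sketch of the behavior the test above pins down (the real
# implementation lives in the project's urlutils module): server-side fetches
# must not be pointed at loopback or private addresses, so the URL's hostname
# is resolved and such URLs are rejected. check_url_sketch and its internals
# are illustrative names, not the project's API.
import ipaddress
import socket


class InvalidURLError(Exception):
    pass


def check_url_sketch(url):
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.hostname:
        raise InvalidURLError(f"invalid URL: {url!r}")
    ip = ipaddress.ip_address(socket.gethostbyname(parsed.hostname))
    if ip.is_loopback or ip.is_private:
        raise InvalidURLError(f"{url!r} resolves to a non-public address")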