def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    """Fetch *url* and collect the structured metadata embedded in the page.

    Args:
        url: Address to download (30 second timeout).
        microdata: When true, add a ``'microdata'`` entry to the result.
        jsonld: When true, add a ``'json-ld'`` entry to the result.
        rdfa: When true, add an ``'rdfa'`` entry to the result.

    Returns:
        A dict that always holds ``'url'`` (the argument as given) and
        ``'status'`` (``"<status code> <reason>"``).  If the response
        status makes ``raise_for_status()`` raise ``HTTPError``, only those
        two keys are returned; otherwise one entry is added per enabled
        syntax, holding whatever that extractor's ``extract_items`` returns.
    """
    reply = requests.get(url, timeout=30)
    summary = {
        'url': url,
        'status': '{} {}'.format(reply.status_code, reply.reason),
    }

    try:
        reply.raise_for_status()
    except requests.exceptions.HTTPError:
        # Error responses are reported through 'status' only; no parsing.
        return summary

    document = lxml.html.fromstring(
        reply.content,
        parser=XmlDomHTMLParser(encoding=reply.encoding),
    )

    # (enabled?, result key, extractor factory).  Factories are lambdas so
    # an extractor is only looked up and built when its syntax is requested.
    syntaxes = (
        (microdata, 'microdata', lambda: MicrodataExtractor(nested=True)),
        (jsonld, 'json-ld', lambda: JsonLdExtractor()),
        (rdfa, 'rdfa', lambda: RDFaExtractor()),
    )
    for enabled, key, build_extractor in syntaxes:
        if enabled:
            # The response's own URL (not the argument) is handed to the
            # extractor alongside the parsed document.
            summary[key] = build_extractor().extract_items(document, reply.url)

    return summary
def async_extruct(url, microdata=True, jsonld=True):
    """Download *url* and return the metadata extracted from its HTML.

    Before fetching, the content type of the module-level ``response``
    object is set to ``'application/json'`` (presumably the web
    framework's outgoing response -- confirm against the file's imports).

    Args:
        url: Address to download (30 second timeout).
        microdata: When true, add a ``'microdata'`` entry to the result.
        jsonld: When true, add a ``'json-ld'`` entry to the result.

    Returns:
        A dict with ``'url'``, a ``'status'`` that is always ``'ok'``, and
        one entry per enabled syntax holding the extractor's output.
    """
    response.content_type = 'application/json'

    fetched = requests.get(url, timeout=30)
    html_parser = lxml.html.HTMLParser(encoding=fetched.encoding)
    document = lxml.html.fromstring(fetched.content, parser=html_parser)

    payload = {'url': url, 'status': 'ok'}

    if microdata:
        payload['microdata'] = MicrodataExtractor(nested=True).extract_items(
            document, url
        )

    if jsonld:
        # The JSON-LD extractor is called with the document only (no URL).
        payload['json-ld'] = JsonLdExtractor().extract_items(document)

    return payload
def async_extruct(url, microdata=True, jsonld=True):
    """Download *url* and return any non-empty metadata found in its HTML.

    Args:
        url: Address to download (30 second timeout).
        microdata: When true, run the microdata extractor.
        jsonld: When true, run the JSON-LD extractor.

    Returns:
        A dict with ``"url"`` and a ``"status"`` that is always ``"ok"``.
        A ``"microdata"`` / ``"json-ld"`` entry is added only when the
        corresponding extractor's output has a truthy ``"items"`` value.
    """
    fetched = requests.get(url, timeout=30)
    document = lxml.html.fromstring(
        fetched.content,
        parser=lxml.html.HTMLParser(encoding=fetched.encoding),
    )

    payload = {"url": url, "status": "ok"}

    if microdata:
        extractor = MicrodataExtractor(nested=True)
        found = extractor.extract_items(document, url)
        # Skip the key entirely when the extractor produced no items.
        if found.get("items", []):
            payload["microdata"] = found

    if jsonld:
        extractor = JsonLdExtractor()
        # The JSON-LD extractor is called with the document only (no URL).
        found = extractor.extract_items(document)
        if found.get("items", []):
            payload["json-ld"] = found

    return payload
def async_extruct(url, microdata=True, jsonld=True):
    """Download *url* and return any non-empty metadata found in its HTML.

    Before fetching, the content type of the module-level ``response``
    object is set to ``'application/json'`` (presumably the web
    framework's outgoing response -- confirm against the file's imports).

    Args:
        url: Address to download (30 second timeout).
        microdata: When true, run the microdata extractor.
        jsonld: When true, run the JSON-LD extractor.

    Returns:
        A dict with ``'url'`` and a ``'status'`` that is always ``'ok'``.
        A ``'microdata'`` / ``'json-ld'`` entry is added only when the
        corresponding extractor's output has a truthy ``'items'`` value.
    """
    response.content_type = 'application/json'

    page = requests.get(url, timeout=30)
    html_parser = lxml.html.HTMLParser(encoding=page.encoding)
    tree = lxml.html.fromstring(page.content, parser=html_parser)

    output = {'url': url, 'status': 'ok'}

    if microdata:
        microdata_found = MicrodataExtractor(nested=True).extract_items(tree, url)
        # Only report microdata when at least one item was extracted.
        if microdata_found.get('items', []):
            output['microdata'] = microdata_found

    if jsonld:
        # The JSON-LD extractor is called with the document only (no URL).
        jsonld_found = JsonLdExtractor().extract_items(tree)
        if jsonld_found.get('items', []):
            output['json-ld'] = jsonld_found

    return output