Пример #1
0
def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    """Download *url* and gather the structured metadata found in its HTML.

    Args:
        url: Address to fetch (the request uses a 30-second timeout).
        microdata: When truthy, add a ``'microdata'`` entry to the result.
        jsonld: When truthy, add a ``'json-ld'`` entry to the result.
        rdfa: When truthy, add an ``'rdfa'`` entry to the result.

    Returns:
        A dict that always holds ``'url'`` (the address as passed in) and
        ``'status'`` (``"<status code> <reason>"``). When the response
        carries an HTTP error status only those two keys are returned.
        Otherwise one more entry is added per enabled extractor, holding
        whatever that extractor's ``extract_items`` returns.
    """
    resp = requests.get(url, timeout=30)
    status_line = '{} {}'.format(resp.status_code, resp.reason)
    result = {'url': url, 'status': status_line}

    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        # Error responses are reported through 'status' only; no parsing.
        return result

    html_parser = XmlDomHTMLParser(encoding=resp.encoding)
    document = lxml.html.fromstring(resp.content, parser=html_parser)

    # (enabled flag, result key, extractor factory), in output order.
    # Factories are lambdas so an extractor is only looked up and built
    # when its flag is set.
    extraction_plan = (
        (microdata, 'microdata', lambda: MicrodataExtractor(nested=True)),
        (jsonld, 'json-ld', lambda: JsonLdExtractor()),
        (rdfa, 'rdfa', lambda: RDFaExtractor()),
    )
    for enabled, key, make_extractor in extraction_plan:
        if enabled:
            # The URL reported by the response object is passed as the
            # second argument of extract_items.
            result[key] = make_extractor().extract_items(document, resp.url)

    return result
Пример #2
0
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return the metadata extracted from its HTML.

    Side effect: sets ``content_type`` on the externally defined
    ``response`` object to ``'application/json'`` before the request is
    made (presumably the web framework's response -- confirm with caller).

    Args:
        url: Address to fetch (30-second timeout); also handed to the
            microdata extractor as its second argument.
        microdata: When truthy, include a ``'microdata'`` entry.
        jsonld: When truthy, include a ``'json-ld'`` entry.

    Returns:
        A dict with ``'url'``, a ``'status'`` that is always ``'ok'`` (the
        HTTP status of the fetch is not inspected), and one entry per
        enabled extractor.
    """
    response.content_type = 'application/json'
    page = requests.get(url, timeout=30)

    # Parse the raw bytes using the encoding reported by the response.
    html_parser = lxml.html.HTMLParser(encoding=page.encoding)
    document = lxml.html.fromstring(page.content, parser=html_parser)

    extracted = {'url': url, 'status': 'ok'}

    if microdata:
        extracted['microdata'] = MicrodataExtractor(nested=True).extract_items(
            document, url)

    if jsonld:
        extracted['json-ld'] = JsonLdExtractor().extract_items(document)

    return extracted
Пример #3
0
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return only the non-empty metadata found in its HTML.

    Args:
        url: Address to fetch (30-second timeout); also handed to the
            microdata extractor as its second argument.
        microdata: When truthy, run the microdata extractor.
        jsonld: When truthy, run the JSON-LD extractor.

    Returns:
        A dict with ``"url"`` and a ``"status"`` that is always ``"ok"``
        (the HTTP status of the fetch is not inspected). ``"microdata"``
        and ``"json-ld"`` are added only when the corresponding extractor
        ran and its output has a truthy ``"items"`` value.
    """
    page = requests.get(url, timeout=30)

    # Parse the raw bytes using the encoding reported by the response.
    html_parser = lxml.html.HTMLParser(encoding=page.encoding)
    document = lxml.html.fromstring(page.content, parser=html_parser)

    payload = {"url": url, "status": "ok"}

    if microdata:
        microdata_found = MicrodataExtractor(nested=True).extract_items(
            document, url)
        # Skip the key entirely when nothing was extracted.
        if microdata_found.get("items", []):
            payload["microdata"] = microdata_found

    if jsonld:
        jsonld_found = JsonLdExtractor().extract_items(document)
        if jsonld_found.get("items", []):
            payload["json-ld"] = jsonld_found

    return payload
Пример #4
0
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return only the non-empty metadata found in its HTML.

    Side effect: sets ``content_type`` on the externally defined
    ``response`` object to ``'application/json'`` before the request is
    made (presumably the web framework's response -- confirm with caller).

    Args:
        url: Address to fetch (30-second timeout); also handed to the
            microdata extractor as its second argument.
        microdata: When truthy, run the microdata extractor.
        jsonld: When truthy, run the JSON-LD extractor.

    Returns:
        A dict with ``'url'`` and a ``'status'`` that is always ``'ok'``
        (the HTTP status of the fetch is not inspected). ``'microdata'``
        and ``'json-ld'`` are added only when the corresponding extractor
        ran and its output has a truthy ``'items'`` value.
    """
    response.content_type = 'application/json'
    fetched = requests.get(url, timeout=30)

    # Parse the raw bytes using the encoding reported by the response.
    html_parser = lxml.html.HTMLParser(encoding=fetched.encoding)
    tree = lxml.html.fromstring(fetched.content, parser=html_parser)

    output = {'url': url, 'status': 'ok'}

    if microdata:
        microdata_items = MicrodataExtractor(nested=True).extract_items(
            tree, url)
        # Skip the key entirely when nothing was extracted.
        if microdata_items.get('items', []):
            output['microdata'] = microdata_items

    if jsonld:
        jsonld_items = JsonLdExtractor().extract_items(tree)
        if jsonld_items.get('items', []):
            output['json-ld'] = jsonld_items

    return output