def __init__(self, url, test=False):
    self.testing_mode = test
    self.data = {}
    self.format = None
    if test:
        # when testing, we load a file
        with url:
            r = url.read()
        data = extruct.extract(
            r,
            base_url="https://www.allrecipes.com/recipe/133948/four-cheese-margherita-pizza/",
            syntaxes=SYNTAXES,
            uniform=True,
        )
    else:
        r = requests.get(url, headers=HEADERS)
        data = extruct.extract(
            r.text,
            base_url=get_base_url(r.text, r.url),
            syntaxes=SYNTAXES,
            uniform=True,
        )
    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            if (
                "@context" in item
                and item["@context"] == SCHEMA_ORG_HOST
                and "@type" in item
                and item["@type"].lower() == SCHEMA_NAME.lower()
            ):
                self.format = syntax
                self.data = item
                return
def get_article_content(url):
    # Extract once; the original fetched and re-parsed the page for every
    # check, which is redundant and slow.
    metadata = extruct.extract(get_html(url))
    if metadata.get('microdata'):
        return parse_microdata(metadata['microdata'])
    elif metadata.get('json-ld'):
        return parse_json_ld(metadata['json-ld'])
def load(fp: Union[str, IO[str]],
         python_objects: bool = False,
         nonstandard_attrs: bool = False,
         migrate_old_schema: bool = True) -> List[Dict]:
    """Load a filename or file object to scrape.

    Parameters
    ----------
    fp : string or file-like object
        A file name or a file-like object.
    python_objects : bool, optional
        When True, translates some data types into Python objects:
        dates into datetime.date, datetimes into datetime.datetime,
        durations into datetime.timedelta. (defaults to False)
    nonstandard_attrs : bool, optional
        When True, adds nonstandard (for schema.org/Recipe) attributes to
        the resulting dictionaries that are outside the specification,
        such as:
            '_format' is either 'json-ld' or 'microdata' (how
                schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been
                defined as another value
        (defaults to False)
    migrate_old_schema : bool, optional
        When True, migrates the schema from an older version to the
        current version. (defaults to True)

    Returns
    -------
    list
        A list of dictionaries in the style of schema.org/Recipe JSON-LD;
        an empty list is returned when there are no results.
    """
    data = {}  # type: Dict[str, List[Dict]]
    if isinstance(fp, str):
        with open(fp) as f:
            data = extruct.extract(f.read())
    elif hasattr(fp, 'read'):
        # Assume this is some kind of file-like object that can be read.
        data = extruct.extract(fp.read())
    else:
        raise TypeError('expected fp to be a filename or a file-like object, '
                        'fp is of type {}'.format(type(fp)))

    scrapings = _convert_to_scrapings(data, nonstandard_attrs)
    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)
    if python_objects is True:
        scrapings = _pythonize_objects(scrapings)
    return scrapings
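# A minimal usage sketch for load() above; 'recipe_page.html' is a
# hypothetical local file containing schema.org/Recipe markup, and the
# module-level helpers (_convert_to_scrapings and friends) are assumed
# to be defined alongside load().
recipes = load('recipe_page.html', python_objects=True)
for recipe in recipes:
    # with python_objects=True, durations arrive as datetime.timedelta
    print(recipe.get('name'), recipe.get('totalTime'))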
def test_errors(self):
    body = ''

    # raise exceptions
    with pytest.raises(Exception):
        data = extruct.extract(body)

    # ignore exceptions
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # log exceptions
    data = extruct.extract(body, errors='log')
    assert data == expected
def get_json_ld_description(url):
    description = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException:
        error_code = '404'
        return description, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except Exception:
        error_code = '500'
        return description, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
    except Exception:
        error_code = '500'
        return description, error_code
    for item in data['json-ld']:
        if item.get('description'):
            description = item['description']
    return description, '200'
def resolve_jsonld(html):
    from extruct import extract

    ld = extract(html,
                 syntaxes=['json-ld', 'microdata', 'opengraph', 'rdfa'],
                 uniform=True)
    # TODO: deal with other types that aren't json-ld by converting them.
    logging.info('extracted metadata: %s', ld)
    return ld
def get_metadata(html: bytes, url: str):
    """Fetch JSON-LD structured data."""
    # get_base_url() needs the page body as well as the URL; passing the
    # URL alone (as the original did) yields an empty base URL.
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    return metadata
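# A hedged usage sketch for get_metadata(); the URL is illustrative, and
# fetching with requests is an assumption, not part of the snippet above.
import requests

response = requests.get("https://example.org/article")
for item in get_metadata(response.content, response.url):
    print(item.get("@type"), item.get("headline"))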
def extract(html, url):
    data = extruct.extract(
        html,
        url,
        syntaxes=['microdata', 'json-ld', 'rdfa', 'microformat'],
        errors='log')
    schemas = []
    if data.get('json-ld'):
        for schema in data['json-ld']:
            if schema.get('@type'):
                schema_type = schema_pattern_match(schema['@type'])
                if schema_type:
                    schema['@type'] = schema_type
                    schemas.append(schema)
    if data.get('microdata'):
        for schema in data['microdata']:
            schema_type = schema_pattern_match(schema['type'])
            if schema_type:
                schema['type'] = schema_type
                schemas.append(schema)
    if data.get('microformat'):
        for schema in data['microformat']:
            schema_type = schema_pattern_match(schema['type'])
            if schema_type:
                schema['type'] = schema_type
                schemas.append(schema)
    return schemas
def get_jsons(url):
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    if 'json-ld' not in data:
        raise Exception('No json-ld data found')
    return data['json-ld']
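# A small usage sketch for get_jsons(); the URL is illustrative only.
try:
    items = get_jsons("https://example.org/product-page")
except Exception as exc:
    print("extraction failed:", exc)
else:
    for item in items:
        print(item.get("@type"))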
def test_deprecated_url(self):
    body, expected = self._microdata_custom_url('product_custom_url.json')
    with pytest.warns(DeprecationWarning):
        data = extruct.extract(body,
                               url='http://some-example.com',
                               syntaxes=['microdata'])
    self.assertEqual(data, expected)
def get_recipe_data(url):
    def _find_recipe(c):
        # Walk the extracted structure recursively, returning the first
        # dict whose @type is "Recipe".
        if isinstance(c, dict):
            if c.get("@type") == "Recipe":
                return c
            for i in c.values():
                res = _find_recipe(i)
                if res:
                    return res
        if isinstance(c, list):
            for i in c:
                res = _find_recipe(i)
                if res:
                    return res
        return []

    html = requests.get(url, headers=HEADERS, cookies=COOKIES)
    data_list = extract(html.text, uniform=True)
    recipe_data = _find_recipe(data_list)
    if not recipe_data:
        raise MissingSchema(
            "Website does not provide a schema.org Recipe schema in a "
            "json-ld format"
        )
    return recipe_data
def test_umicrodata(self):
    expected = [{
        "@context": "http://schema.org",
        "@type": "Product",
        "brand": "ACME",
        "name": "Executive Anvil",
        "image": "anvil_executive.jpg",
        "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
        "mpn": "925872",
        "aggregateRating": {
            "@type": "AggregateRating",
            "ratingValue": "4.4",
            "reviewCount": "89"
        },
        "offers": {
            "@type": "Offer",
            "priceCurrency": "USD",
            "price": "119.99",
            "priceValidUntil": "2020-11-05",
            "seller": {
                "@type": "Organization",
                "name": "Executive Objects"
            },
            "itemCondition": "http://schema.org/UsedCondition",
            "availability": "http://schema.org/InStock"
        }
    }]
    body = get_testdata('misc', 'product_microdata.html')
    data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
    self.assertEqual(data['microdata'], expected)
def process_spider_output(self, response, result, spider):
    """get all metadata and add them as fields to item"""
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    for scraped_item in result:
        if isinstance(scraped_item, Request):
            # yield the request without making changes
            yield scraped_item
        else:
            # if this is an item, inspect it for microdata
            data = extruct.extract(
                response.body,
                response.url,
                syntaxes=['microdata', 'opengraph', 'rdfa', 'json-ld'],
                uniform=True)
            logging.debug(response.url)
            scraped_item['url'] = response.url
            # scraped_item['source_spider'] = spider.name
            if not data:
                # if no microdata was found, set the flag and yield the item
                logging.debug('this item has no microdata')
                scraped_item['hasMetaData'] = False
            else:
                scraped_item['hasMetaData'] = True
                scraped_item['microdatas'] = data
            yield scraped_item
def test_uopengraph(self):
    expected = [{
        "@context": {
            "og": "http://ogp.me/ns#",
            "fb": "http://www.facebook.com/2008/fbml",
            "concerts": "http://ogp.me/ns/fb/songkick-concerts#"
        },
        "fb:app_id": "308540029359",
        "og:site_name": "Songkick",
        "@type": "songkick-concerts:artist",
        "og:title": "Elysian Fields",
        "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
        "og:url": "http://www.songkick.com/artists/236156-elysian-fields",
        "og:image": "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
    }]
    body = get_testdata('songkick', 'elysianfields.html')
    data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
    self.assertEqual(data['opengraph'], expected)
def process_spider_output(self, response, result, spider):
    """get all metadata and add them as fields to item"""
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    for x in result:
        if isinstance(x, Request):
            # yield the request without making changes
            yield x
        else:
            # if this is an item, inspect it for microdata
            data = extruct.extract(response.body, response.url)
            logging.debug(response.url)
            x['url'] = response.url
            if not data:
                # if no microdata was found, set the flag and yield the item
                logging.debug('this item has no microdata')
                x['hasMetaData'] = False
            else:
                # if there is microdata, parse it into the item
                x['hasMetaData'] = True
                logging.debug('data is an object of type')
                logging.debug(type(data))
                logging.debug('parsing microdata fields')
                for microdatatype, microdatafields in data.items():
                    logging.debug(microdatatype)
                    logging.debug(microdatafields)
                    x[microdatatype] = microdatafields
            yield x
def parse(self, response):
    if len(self.claimdf) == 100:
        raise CloseSpider('100 crawled')
    data = extruct.extract(response.text, response.url)['microdata']
    selected = [
        properties for properties in data
        if properties['type'] == 'http://schema.org/ClaimReview'
    ]
    for elements in selected:
        dictt = elements['properties']
        # flatten single-element lists so json_normalize produces scalars
        for key in dictt:
            if isinstance(dictt[key], list):
                dictt[key] = dictt[key][0]
        # DataFrame.append and pd.io.json.json_normalize were removed or
        # deprecated in modern pandas; use pd.concat and pd.json_normalize.
        self.claimdf = pd.concat(
            [self.claimdf, pd.json_normalize(dictt)], ignore_index=True)
        self.claimdf.to_csv('metro.se.csv', encoding='utf-8')
def get_opengraph_description(url):
    description = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException:
        error_code = '404'
        return description, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except Exception:
        error_code = '500'
        return description, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['opengraph'])
    except Exception:
        error_code = '500'
        return description, error_code
    for og_item in data['opengraph']:
        if og_item.get('properties'):
            for prop, value in og_item['properties']:
                if prop == 'og:description':
                    description = value
    return description, '200'
def extraer_informacion_web():
    pp = pprint.PrettyPrinter(indent=2)
    r = requests.get(
        'https://www.casadellibro.com/libro-los-renglones-torcidos-de-hollywood/9788412094749/11187413'
    )
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    schema = data['json-ld']
    soup = BeautifulSoup(r.text, 'lxml')
    imagen = soup.find('img', {'class': 'product-image'})
    descripcion = soup.find('div', {'class': 'hidden-sm-and-down'})
    desc = descripcion.text
    desc = desc.replace("Ver más", '').strip()
    if "CRÍTICAS" in desc:
        desc = desc.split("CRÍTICAS")[0]
    if "Resumen" in desc:
        desc = desc.split("Resumen")[1]
    obj = {}
    obj['desc'] = desc
    obj['imagen'] = imagen['data-src']
    obj['schema'] = schema
    print(obj)
    print(type(obj))
def get_json_ld_headline(url):
    headline = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException:
        error_code = '404'
        return headline, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except Exception:
        error_code = '500'
        return headline, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
    except Exception:
        error_code = '500'
        return headline, error_code
    for item in data['json-ld']:
        if item.get('headline'):
            headline = item['headline']
            # truncate long headlines to at most 200 characters
            headline = (headline[:197] + '...') if len(headline) > 197 else headline
    return headline, '200'
def load_jsonld():
    # read the file
    with open(source_file_path) as f:
        content = f.read()
    # extract the embedded metadata https://github.com/scrapinghub/extruct
    data = extruct.extract(content)
    claimReviews = data['json-ld']

    # some analysis of the labels to see how they are annotated
    labels = set(el['reviewRating']['alternateName'] for el in claimReviews)
    lambda_source = lambda el: el['author']['name']
    # group the labels by the author of the review, to see how each one
    # of them uses the alternateName
    labels_by_sources = {
        k: set(el['reviewRating']['alternateName'] for el in v)
        for k, v in itertools.groupby(
            sorted(claimReviews, key=lambda_source), key=lambda_source)
    }

    print('#claimReviews', len(claimReviews))
    print('#labels', len(labels))
    #print('labels', labels)
    print('#label for each source',
          {k: len(v) for k, v in labels_by_sources.items()})

    # save the original claimReviews
    utils.write_json_with_path(claimReviews, intermediate_path,
                               'datacommons_claimReviews.json')
    return claimReviews
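# The grouping idiom used above, shown in isolation: itertools.groupby only
# groups consecutive items, so the input must be sorted by the same key
# first. The reviews below are made-up data for illustration.
import itertools

reviews = [
    {'author': {'name': 'A'}, 'reviewRating': {'alternateName': 'False'}},
    {'author': {'name': 'B'}, 'reviewRating': {'alternateName': 'True'}},
    {'author': {'name': 'A'}, 'reviewRating': {'alternateName': 'Mostly false'}},
]
by_author = lambda el: el['author']['name']
grouped = {
    k: {el['reviewRating']['alternateName'] for el in v}
    for k, v in itertools.groupby(sorted(reviews, key=by_author), key=by_author)
}
print(grouped)  # {'A': {'False', 'Mostly false'}, 'B': {'True'}}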
def recursive_page_scrape(self, page):
    """
    Crawls across website and scrapes recipes as discovered.

    The navigation is recursive, scraping each page it comes upon and
    all the links on that page.
    """
    # Mark as being read
    print_message = f"Recipes Found: {len(self.scraped_recipes)} | Scraping {page}"
    print(print_message.ljust(200, " "), end="\r", flush=True)
    self.link_library[page] = 1
    self.sdb.update_one({"_id": self.source},
                        {"$set": {"link_library": self.link_library}})

    # Scrape it; if it errors out it won't be tried again
    r = requests.get(page, headers=self.headers)
    soup = BeautifulSoup(r.content, "html.parser")
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    # Recipe
    self.scrape_recipe(data, page)

    # Look for all links on page
    for link_string in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        link = clean_url(link_string.get('href'), self.utm_pages)
        if check_link(link, self.domain):
            if link not in self.link_library.keys():
                wait()
                self.recursive_page_scrape(link)
def perform(kls, inputs):
    urls = inputs['target:url']
    # initialize before the loop so both names are bound even when every
    # URL fails (or the list is empty)
    data = None
    tree = None
    for url in urls:
        try:
            r = requests.get(url)
            base_url = get_base_url(r.text, r.url)
            data = extruct.extract(r.text, base_url=base_url,
                                   syntaxes=['json-ld'])['json-ld']
            tree = Tree(data)
            break
        except Exception:
            data = None
    return dict(
        **{
            'metric:30': {
                'answer': 1.0 if data else 0.0,
                'comment': ('jsonld was found and properly parsed'
                            if data else 'jsonld could not be parsed'),
            },
        },
        **{
            key: {
                'answer': 1.0 if attr else 0.0,
                'comment': attr if attr else 'json-ld %s not found' % (
                    ' '.join(to_schema[key])),
            } if key.startswith('metric:') else attr
            for key, attr in zip(
                to_schema.keys(),
                map(bind(get_json_ld_attr, tree), to_schema.values())
            )
        } if data else {key: {} for key in to_schema.keys()}
    )
def __init__(self, page_data, host):
    self.format = None
    self.host = host
    self.data = {}
    data = extruct.extract(
        page_data,
        syntaxes=SYNTAXES,
        uniform=True,
    )
    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            in_context = SCHEMA_ORG_HOST in item.get("@context", "")
            low_schema = [s.lower() for s in SCHEMA_NAMES]
            if in_context and item.get("@type", "").lower() in low_schema:
                self.format = syntax
                self.data = item
                if item.get("@type").lower() == 'webpage':
                    self.data = self.data.get('mainEntity')
                return
            elif in_context and "@graph" in item:
                for graph_item in item.get("@graph", ""):
                    in_graph = SCHEMA_ORG_HOST in graph_item.get("@context", "")
                    if in_graph and graph_item.get("@type", "").lower() in low_schema:
                        self.format = syntax
                        self.data = graph_item
                        if graph_item.get("@type").lower() == 'webpage':
                            self.data = self.data.get('mainEntity')
                        return
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)
    try:
        properties = data["opengraph"][0]["properties"]
    except (KeyError, IndexError):
        return None
    return {
        "name": og_field(properties, "og:title"),
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data
        # verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty but this message
        # is added for user sanity.
        "recipeInstructions": [{"text": "Could not detect instructions"}],
        "slug": slugify(og_field(properties, "og:title")),
        "orgURL": og_field(properties, "og:url"),
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
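# A hedged sketch of driving basic_recipe_from_opengraph(); the URL is
# hypothetical, and og_field/og_fields/slugify are assumed to be defined
# in the same module as the function above.
import requests

response = requests.get("https://example.org/some-recipe")
recipe = basic_recipe_from_opengraph(response.text, response.url)
if recipe is None:
    print("no Open Graph metadata found")
else:
    print(recipe["name"], recipe["orgURL"])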
def __init__(self, page_data):
    self.format = None
    self.data = {}
    data = extruct.extract(page_data, syntaxes=SYNTAXES, uniform=True)
    low_schema = {s.lower() for s in SCHEMA_NAMES}
    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            in_context = SCHEMA_ORG_HOST in item.get("@context", "")
            if in_context and item.get("@type", "").lower() in low_schema:
                self.format = syntax
                self.data = item
                if item.get("@type").lower() == "webpage":
                    self.data = self.data.get("mainEntity")
                return
            elif in_context and "@graph" in item:
                for graph_item in item.get("@graph", ""):
                    graph_item_type = graph_item.get("@type", "")
                    if not isinstance(graph_item_type, str):
                        continue
                    if graph_item_type.lower() in low_schema:
                        in_graph = SCHEMA_ORG_HOST in graph_item.get("@context", "")
                        self.format = syntax
                        if graph_item_type.lower() == "webpage" and in_graph:
                            self.data = self.data.get("mainEntity")
                            return
                        elif graph_item_type.lower() == "recipe":
                            self.data = graph_item
                            return
def rdfFromHTML(html, base_url):
    data = extruct.extract(html,
                           syntaxes=['microdata', 'rdfa', 'json-ld'],
                           errors='ignore')
    print("URL: [" + base_url + "]")
    for type_key in data.keys():
        # list of namespaces used by this syntax
        ns_list = getRdfNameSpaces(data[type_key])
        if data[type_key] != []:
            print(type_key + " Found !")
            if base_url not in URL_WITH_RDF.keys():
                URL_WITH_RDF[base_url] = {}
            URL_WITH_RDF[base_url][type_key] = ns_list
    print("")
    return data
def main(url):
    with open('cytuj_strone.mustache') as template_file:
        template = template_file.read()
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    res = requests.get(url, headers=headers)
    data = extruct.extract(res.text, res.url)
    print(json.dumps(data))

    dd = {}
    if 'json-ld' in data and len(data.get('json-ld', [])) != 0:
        dd = parse_ld(data)
    elif 'microdata' in data:
        dd = parse_micro(data)

    dzisiaj = datetime.datetime.now().isoformat()[:10]  # today's date, YYYY-MM-DD
    dd['data_d'] = dzisiaj
    if not dd.get('url'):
        dd['url'] = url
    msg = "{{" + pystache.render(template, dd) + "}}"
    print(msg)
def extract_rdf_extruct(self, url) -> ConjunctiveGraph:
    while True:
        try:
            response = requests.get(url=url, timeout=10)
            break
        except SSLError:
            time.sleep(5)
        except requests.exceptions.Timeout:
            print("Timeout, retrying")
            time.sleep(5)
        except requests.exceptions.ConnectionError as e:
            print(e)
            print("ConnectionError, retrying...")
            time.sleep(10)

    self.status_code = response.status_code
    html_source = response.content
    data = extruct.extract(
        html_source, syntaxes=["microdata", "rdfa", "json-ld"], errors="ignore"
    )

    kg = ConjunctiveGraph()
    base_path = Path(__file__).parent.parent  # current directory
    static_file_path = str((base_path / "static/data/jsonldcontext.json").resolve())

    # The handling is identical for all three syntaxes: rewrite remote
    # schema.org contexts to the local copy, then parse as JSON-LD.
    for syntax in ("json-ld", "rdfa", "microdata"):
        for md in data.get(syntax, []):
            if "@context" in md.keys():
                if ("https://schema.org" in md["@context"]) or (
                    "http://schema.org" in md["@context"]
                ):
                    md["@context"] = static_file_path
            kg.parse(data=json.dumps(md, ensure_ascii=False), format="json-ld")

    logging.debug(kg.serialize(format="turtle"))
    kg.namespace_manager.bind("sc", URIRef("http://schema.org/"))
    kg.namespace_manager.bind("bsc", URIRef("https://bioschemas.org/"))
    kg.namespace_manager.bind("dct", URIRef("http://purl.org/dc/terms/"))
    return kg
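# A hedged usage sketch: assuming `checker` is an instance of the class that
# defines extract_rdf_extruct(), the returned rdflib ConjunctiveGraph can be
# queried with SPARQL for the schema.org types found on the page. The URL is
# illustrative only.
kg = checker.extract_rdf_extruct("https://example.org/dataset-landing-page")
for row in kg.query("SELECT DISTINCT ?type WHERE { ?s a ?type }"):
    print(row.type)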
def _get_extracted_data(self):
    extracted_data = extruct.extract(self.html,
                                     self.url,
                                     errors='ignore',
                                     syntaxes=['microdata'],
                                     uniform=True)
    return extracted_data['microdata']
def test_all(self):
    body = get_testdata('songkick', 'elysianfields.html')
    expected = json.loads(
        get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
    data = extruct.extract(
        body,
        base_url='http://www.songkick.com/artists/236156-elysian-fields')
    self.assertEqual(jsonize_dict(data), expected)
def _dochits_to_objset(self, docHits):
    '''Returns list of objects.'''
    objset = []
    for d in docHits:
        r = requests.get(d.text)
        # get JSON-LD from the page; get_base_url needs both the body and
        # the final URL (the original passed only the body)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url)
        jsld = data.get('json-ld')[0]
        obj = {}
        obj_mdata = defaultdict(list)
        for mdata in jsld:
            obj_mdata[mdata] = jsld[mdata]
        obj['metadata'] = dict(obj_mdata)
        objset.append(obj)
        self.docs_fetched += 1
    return objset