def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    """Fetch *url* and return a dict of metadata extracted from the page.

    The result always contains 'url' and 'status'; on a successful fetch it
    additionally contains one key per enabled syntax ('microdata',
    'json-ld', 'rdfa').
    """
    resp = requests.get(url, timeout=30)
    result = {
        'url': url,
        'status': '{} {}'.format(resp.status_code, resp.reason),
    }
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        # Report the HTTP error via 'status' rather than propagating.
        return result

    parser = XmlDomHTMLParser(encoding=resp.encoding)
    tree = lxml.html.fromstring(resp.content, parser=parser)

    # Run each enabled extractor over the same parsed tree.
    selected = []
    if microdata:
        selected.append(('microdata', MicrodataExtractor(nested=True)))
    if jsonld:
        selected.append(('json-ld', JsonLdExtractor()))
    if rdfa:
        selected.append(('rdfa', RDFaExtractor()))
    for key, extractor in selected:
        result[key] = extractor.extract_items(tree, resp.url)
    return result
def parse(self, response):
    """Yield one record per extracted microdata item of a targeted type."""
    extractor = MicrodataExtractor()
    for entry in extractor.extract(response.body):
        if entry['type'] not in self.target_types:
            continue
        yield {
            'indexed_date': datetime.date.today().isoformat(),
            'url': response.url,
            'body': entry,
        }
def parse_item(self, response):
    """Extract microdata/JSON-LD items; follow each item's URL recursively,
    yielding the item itself once the crawled URL matches it."""

    def _as_jsonld(md):
        # Flatten extruct microdata output into a JSON-LD-like dict.
        # Returns None when the entry carries no properties.
        props = md.get('properties')
        if props:
            props['@type'] = md.get('type')
            return props

    html = response.body_as_unicode()
    found = []
    found += map(_as_jsonld,
                 MicrodataExtractor().extract(html, response.url)['items'])
    found += JsonLdExtractor().extract(html, response.url)['items']

    if not found:
        self.logger.debug("No Microdata items found for %s", response.url)
    self.logger.debug("Checking URL for item: %s", found)

    for entry in found:
        if not entry or not entry.get('url'):
            self.logger.debug("No URL for item: %s", entry)
            continue
        if entry['url'] != response.url:
            self.logger.debug("Not in main URL, go there..")
            yield Request(entry['url'], callback=self.parse_item)
        else:
            entry['@type'] = entry.get('type')
            self.logger.debug("Parsed microdata: %s" % entry)
            yield entry
def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
    """Run the requested extruct extractors over *response*.

    Args:
        response: HTTP response object exposing ``.text``, ``.url`` and
            ``.meta`` (looks like a scrapy Response — TODO confirm).
        microdata/jsonld/rdfa: enable the corresponding extractor; results
            are stored on ``self.mdedata`` / ``self.jldata`` /
            ``self.rdfadata``.

    Fix: previously a JSONDecodeError during rdfa/microdata extraction left
    ``self.rdfadata`` / ``self.mdedata`` unset, so any later attribute read
    raised AttributeError. They now default to an empty list.
    """
    self.response = response
    self.microdata = microdata
    self.jsonld = jsonld
    self.rdfa = rdfa
    if rdfa:
        self.rdfadata = []  # safe default if extraction fails
        try:
            self.rdfae = RDFaExtractor()
            self.rdfadata = self.rdfae.extract(self.response.text,
                                               url=self.response.url)
        except JSONDecodeError:
            pass
    if microdata:
        self.mdedata = []  # safe default if extraction fails
        try:
            self.mde = MicrodataExtractor()
            self.mdedata = self.mde.extract(self.response.text)
        except JSONDecodeError:
            pass
    if jsonld:
        try:
            self.jlde = JsonLdExtractor()
            self.jldata = self.jlde.extract(self.response.text)
        except (JSONDecodeError, TypeError):
            self.jldata = []
        finally:
            # Sometimes we get this in the meta dict from RISJExtractJSONLD
            self.jldata.extend(self.response.meta.get('json-ld', []))
def parse_listing(self, response):
    """Assemble a listing dict from page microdata plus CSS-scraped fields."""
    extracted = MicrodataExtractor().extract(response.body)['items']
    if not extracted:
        return

    # Item 0 is the shop, item 1 the product (per this site's markup).
    prod = extracted[1]['properties']
    it = {'shop': extracted[0]['properties']}
    it.update(prod['offerDetails']['properties'])
    it['name'] = prod['name']
    it['url'] = response.url

    banned = ('materials', 'feedback', 'favorited', 'ships')
    it['properties'] = [
        text
        for text in response.css('#item-overview .properties li::text').extract()
        if not any(word in text.lower() for word in banned)
    ]
    it['materials'] = e0(response.css('#overview-materials::text'))
    it['origin'] = e0(response.css('.origin::text'))
    it['imgs'] = response.css('#image-carousel img::attr("src")').extract()
    it['description'] = e0(response.css("#description-text"))
    it['tags'] = response.css('#listing-tag-list li a::text').extract()
    it['fineprints'] = [
        text.strip()
        for text in response.css('#fineprint li::text').extract()[:4]
    ]
    it['rating'] = response.css('.review-rating meta::attr("content")').extract()
    #it['html'] = response.body
    yield it
def __init__(self, response):
    """Extract microdata from *response*, retrying with a latin-1 decode.

    Fix: the bare ``except:`` also trapped SystemExit/KeyboardInterrupt;
    narrowed to ``Exception``.
    """
    mde = MicrodataExtractor()
    try:
        self.data = mde.extract(response.body, response.url)
    except Exception:
        # Some pages are mis-encoded; latin-1 can decode any byte sequence,
        # so use it as a lossless fallback.
        self.data = mde.extract(response.body.decode('latin-1'), response.url)
def parse(self, response):
    """Yield the microdata extracted from the response.

    Fix: the original iterated over ``response`` (scrapy responses are not
    iterable), referenced an undefined name ``html_content``, and yielded a
    one-element ``set`` containing the extraction result — a TypeError when
    the result is unhashable (dicts/lists are). Rewritten to the evident
    intent: extract from the response body and yield the data directly.
    """
    mde = MicrodataExtractor()
    beacon_data = mde.extract(response.text)
    yield beacon_data
def test_w3c_meter_element(self):
    """Strict extraction of the W3C <meter> sample matches the fixture."""
    html = get_testdata('w3c', 'microdata.4.2.meter.html')
    wanted = json.loads(
        get_testdata('w3c', 'microdata.4.2.meter.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    self.assertDictEqual(extractor.extract(html), wanted)
def test_w3c_object_element(self):
    """Strict extraction of the W3C <object> sample matches the fixture."""
    html = get_testdata('w3c', 'microdata.object.html')
    wanted = json.loads(
        get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    found = extractor.extract(html, 'http://www.example.com/microdata/test')
    self.assertEqual(found, wanted)
def test_w3c_7_1(self):
    """Flat (non-nested) strict extraction matches the W3C 7.1 fixture."""
    html = get_testdata('w3c', 'microdata.7.1.html')
    wanted = json.loads(
        get_testdata('w3c', 'microdata.7.1.flat.json').decode('UTF-8'))
    extractor = MicrodataExtractor(nested=False, strict=True)
    found = extractor.extract(html, 'http://blog.example.com/progress-report')
    self.assertEqual(found, wanted)
def test_w3c_5_2(self):
    """Extraction with text content enabled matches the W3C 5.2 fixture."""
    html = get_testdata('w3c', 'microdata.5.2.html')
    wanted = json.loads(
        get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))
    extractor = MicrodataExtractor(add_text_content=True)
    self.assertEqual(extractor.extract(html), wanted)
def get_microdata_extruct_items(htmltext):
    """Return extruct microdata extracted from *htmltext*.

    Returns None when the markup cannot be parsed at all.
    """
    extractor = MicrodataExtractor()
    try:
        return extractor.extract(htmltext)
    except XMLSyntaxError:
        # Unparseable markup: nothing to extract.
        return None
def test_join_none(self):
    """Extraction of the product-ref sample matches its JSON fixture."""
    html = get_testdata('schema.org', 'product-ref.html')
    wanted = json.loads(
        get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))
    self.assertEqual(MicrodataExtractor().extract(html), wanted)
def test_w3c_textContent_values(self):
    """Strict extraction of textContent values matches the W3C fixture."""
    html = get_testdata('w3c', 'microdata.4.2.strings.html')
    wanted = json.loads(
        get_testdata('w3c', 'microdata.4.2.strings.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    self.assertEqual(extractor.extract(html), wanted)
def test_w3c_5_5(self):
    """Strict extraction of the W3C 5.5 sample matches its fixture."""
    html = get_testdata('w3c', 'microdata.5.5.html')
    wanted = json.loads(
        get_testdata('w3c', 'microdata.5.5.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    self.assertEqual(extractor.extract(html), wanted)
def parse_product(self, response):
    """Build a Product item from page microdata, falling back to values
    scraped from inline 'vgoogle_ecommProd*' analytics lines.

    NOTE(review): this looks like Python 2 code — ``response.body.split('\n')``
    would raise TypeError on Python 3 where ``body`` is bytes; confirm.
    """
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    # Second microdata item is assumed to be the product — TODO confirm.
    product_data = data['items'][1]['properties']
    extra_data = {}
    # Scrape fallback values from 'vgoogle_ecommProdXXX: "value"' lines.
    for l in response.body.split('\n'):
        if 'vgoogle_ecommProd' in l:
            line_data = l.strip()
            key = line_data.split(':')[0].strip().replace(
                'vgoogle_ecommProd', '')
            # [3:-3] strips the surrounding quote/space decoration —
            # presumably ` "` ... `",` around the value; verify.
            value = line_data.split(':')[1][3:-3]
            if key not in extra_data:
                extra_data[key] = value
    loader = ProductLoader(item=Product(), response=response)
    # NOTE(review): extra_data['ID'] (and 'Name'/'Brand'/'Cat' below) are
    # evaluated eagerly as dict.get defaults — KeyError if the analytics
    # line was absent even when the microdata value exists.
    loader.add_value('identifier', product_data.get('sku', extra_data['ID']))
    loader.add_value('sku', product_data.get('sku', extra_data['ID']))
    loader.add_value('name', product_data.get('name', extra_data['Name']))
    loader.add_value('url', response.url)
    # Price: microdata first, analytics second, page xpath last.
    if 'price' in product_data:
        loader.add_value('price', product_data['price'])
    elif 'Price' in extra_data:
        loader.add_value('price', extra_data['Price'])
    else:
        price = response.xpath('//*[(contains(@class, "product-item") and '
                               'contains(@class, "product-price")) or @id="price-amount"]//text()')\
            .re(r'[\d\.,]+')
        loader.add_value('price', price)
    # Category from the breadcrumb item list; analytics 'Cat' as fallback.
    try:
        loader.add_value(
            'category', data['items'][0]['properties']['itemListElement']
            [1]['properties']['name'])
    except:
        loader.add_value('category', extra_data['Cat'])
    loader.add_value('brand', product_data.get('manufacturer',
                                               extra_data['Brand']))
    # Best-effort image URL from the last srcset candidate.
    try:
        loader.add_value(
            'image_url', response.urljoin(
                response.xpath(
                    '//div[@id="prod-img-placehold"]/img/@srcset').re(
                    r'(.*\.jpg)')[0].split(',')[-1].strip()))
    except:
        pass
    item = loader.load_item()
    metadata = SpecSaversMeta()
    promotional_data = response.xpath(
        '//div[@class="arrow-container"]/div/text()').extract()
    metadata['promotion'] = promotional_data[0].strip(
    ) if promotional_data else ''
    item['metadata'] = metadata
    yield item
def parse(self, response):
    """Yield each microdata item's properties, then follow pagination."""
    extracted = MicrodataExtractor().extract(response.text, response.url)
    for entry in extracted['items']:
        yield entry['properties']
    next_href = response.css("li.next > a::attr(href)").extract_first()
    if next_href is not None:
        yield scrapy.Request(response.urljoin(next_href))
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    """Run all three extruct extractors over *htmlstring*.

    Returns a dict keyed by syntax name ('json-ld', 'microdata', 'rdfa').
    """
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    results = {}
    for key, extractor in (('json-ld', JsonLdExtractor()),
                           ('microdata', MicrodataExtractor()),
                           ('rdfa', RDFaExtractor())):
        results[key] = extractor.extract_items(tree, url=url)
    return results
def test_join_custom_url(self):
    """Extraction with a custom base_url matches the dedicated fixture."""
    html = get_testdata('schema.org', 'product.html')
    wanted = json.loads(
        get_testdata('schema.org', 'product_custom_url.json').decode('UTF-8'))
    found = MicrodataExtractor().extract(html,
                                         base_url='http://some-example.com')
    self.assertEqual(found, wanted)
def parse(self, response):
    """Populate the pending item with name/price from page microdata.

    Fixes:
    - ``.get('items')[0]`` raised TypeError/IndexError when no microdata
      was found; now the response is skipped.
    - ``properties.get('name').replace(...)`` raised AttributeError when
      the 'name' property was missing; now defaults to ''.
    """
    extractor = MicrodataExtractor()
    items = extractor.extract(response.body_as_unicode()).get('items') or []
    if not items:
        # No microdata on this page: nothing to yield.
        return
    properties = items[0].get('properties', {})
    item = response.meta.get('item', {})
    item['url'] = response.url
    item['title'] = (properties.get('name') or '').replace(
        'Details about', '').strip()
    item['price'] = float(
        properties.get('offers', {}).get('properties', {}).get('price', 0)
    )
    yield item
def test_schemaorg_Event(self):
    """Each Event sample's extraction matches its JSON fixture."""
    for idx in [1, 2, 3, 4, 8]:
        html = get_testdata('schema.org', 'Event.{:03d}.html'.format(idx))
        wanted = json.loads(
            get_testdata('schema.org',
                         'Event.{:03d}.json'.format(idx)).decode('UTF-8'))
        self.assertEqual(MicrodataExtractor().extract(html), wanted)
def test_if_punctuations_in_description_are_correctly_formatted(self):
    """Punctuation inside descriptions survives extraction unchanged."""
    html = get_testdata('websites', 'microdata-with-description.html')
    wanted = json.loads(
        get_testdata('websites',
                     'microdata-with-description.json').decode('UTF-8'))
    self.assertEqual(MicrodataExtractor().extract(html), wanted)
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Tries microdata first; falls back to JSON-LD when the page exposes no
    microdata. All listed schema.org Recipe properties are copied into a
    RecipeItem, with None for any that are missing.

    Fix: removed the large body of commented-out dead code (an earlier
    per-field extraction) — behavior is unchanged.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data) == 0:
        # No microdata found — fall back to JSON-LD.
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        recipe = data[0]['properties']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
    properties = [
        'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
        'recipeIngredient', 'aggregateRating', 'recipeYield',
        'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
        'recipeCategory', 'review', 'prepTime', 'description'
    ]
    recipe_output_item = RecipeItem()
    for prop in properties:
        # Missing properties are recorded explicitly as None.
        try:
            recipe_output_item[prop] = recipe[prop]
        except KeyError:
            recipe_output_item[prop] = None
    yield recipe_output_item
def test_schemaorg_LocalBusiness(self):
    """Each LocalBusiness sample's extraction matches its JSON fixture."""
    for idx in [2, 3]:
        html = get_testdata('schema.org',
                            'LocalBusiness.{:03d}.html'.format(idx))
        wanted = json.loads(
            get_testdata(
                'schema.org',
                'LocalBusiness.{:03d}.json'.format(idx)).decode('UTF-8'))
        self.assertEqual(MicrodataExtractor().extract(html), wanted)
def test_schemaorg_MusicRecording(self):
    """Each MusicRecording sample's extraction matches its JSON fixture."""
    for idx in [1]:
        html = get_testdata('schema.org',
                            'MusicRecording.{:03d}.html'.format(idx))
        wanted = json.loads(
            get_testdata(
                'schema.org',
                'MusicRecording.{:03d}.json'.format(idx)).decode('UTF-8'))
        self.assertDictEqual(MicrodataExtractor().extract(html), wanted)
def parse(self, response):
    """Yield targeted microdata items, then crawl any '/events' links."""
    extractor = MicrodataExtractor()
    extracted = extractor.extract(response.body)
    for entry in extracted['items']:
        if entry['type'] not in self.target_types:
            continue
        entry['indexed_date'] = datetime.date.today().isoformat()
        entry['url'] = response.url
        yield entry
    for href in response.xpath('//a/@href').extract():
        if '/events' in href:
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            microdata_base_url: str = "",
            extract_json_ld: bool = False,
            extract_rdfa: bool = False,
            rdfa_base_url: str = "") \
        -> List[Extraction]:
    """
    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
        extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
        extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
        microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
        extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
        extract_rdfa (bool): True if rdfs needs to be extracted, returns as { "rdfa": [...] }
        rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

    Returns:
        List[Extraction]: the list of extraction or the empty list if there are no matches.

    Fix: meta extraction was gated on ``soup.title`` existing
    (``if soup.title and extract_meta``), so pages without a <title> tag
    silently lost all their <meta> tags; it is now gated only on the flag.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    if extract_title and soup.title:
        title = self._wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    if extract_meta:
        meta_content = self._wrap_meta_content(soup.find_all("meta"))
        meta_data = self._wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self._wrap_data(
            "microdata", mde.extract(html_text, microdata_base_url))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self._wrap_data(
            "rdfa", rdfae.extract(html_text, rdfa_base_url))
        res.append(rdfae_data)
    return res
def parse_product(self, response):
    """Build Product items for every size/color variant of a product page.

    General data comes from schema.org Product microdata; categories from
    Breadcrumb microdata; per-variant data from a JSON blob stored in a
    hidden form input.

    NOTE(review): ``filter(...)[0]`` only works on Python 2 where filter
    returns a list — on Python 3 this raises TypeError; this looks like
    Python 2 code (see also ``self.html_parser.unescape``). Confirm.
    """
    mde = MicrodataExtractor()
    try:
        micro_data = mde.extract(response.body)['items']
        gen_data = filter(
            lambda a: a['type'] == 'http://schema.org/Product',
            micro_data)[0]['properties']
        # Breadcrumb titles, skipping the first crumb (presumably "Home").
        categories = [
            c['properties']['title'] for c in
            filter(lambda d: d['type'] ==
                   'http://data-vocabulary.org/Breadcrumb', micro_data)][1:]
    except:
        self.log('WARNING => Wrong product page in %s' % response.url)
        return
    # Microdata values may be lists; take the first entry.
    main_name = gen_data['name']
    if isinstance(main_name, list):
        main_name = main_name[0]
    main_brand = gen_data.get('brand', '')
    if isinstance(main_brand, list):
        main_brand = main_brand[0]
    # Variant data is an HTML-escaped JSON blob in a hidden input.
    variants = response.xpath(
        '//input[@name="ctl00$cphMain$ctl00$hidProductVariants"]/@value'
    ).extract()
    if variants:
        data = json.loads(self.html_parser.unescape(variants[0]))
        # Nesting: top-level entries -> color variants -> size variants.
        for d in data:
            for var in d['Variants']:
                for size_data in var['Variants']:
                    color_name = size_data.get('Article',
                                               dict()).get('ColorName', '')
                    size_data = size_data['Article']
                    url = self.product_url % size_data
                    identifier = size_data['ItemOfferId']
                    name = main_name + ', ' + color_name + ', ' + \
                        size_data['FriendlySize']
                    price = size_data['WebInfo']['ArticlePriceDisplay'][
                        'FormattedSalePriceAfterWithCharges']
                    shipping_cost = size_data['FormattedDeliveryFee']
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('name', name)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', size_data['ProductId'])
                    loader.add_value('price', extract_price_eu(price))
                    if shipping_cost:
                        loader.add_value('shipping_cost',
                                         extract_price_eu(shipping_cost))
                    loader.add_value('image_url', gen_data['image'][-1])
                    # 'L' availability code appears to mean in stock — TODO
                    # confirm; anything else is recorded as zero stock.
                    if size_data['AvailabilityCode'] != 'L':
                        loader.add_value('stock', 0)
                    loader.add_value('category', categories)
                    if main_brand:
                        loader.add_value('brand', main_brand)
                    yield loader.load_item()
    else:
        self.log('WARNING: Variants not found in => %s' % response.url)
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return extracted microdata/JSON-LD as a dict.

    NOTE(review): ``response`` is not defined in this function — presumably
    a framework thread-local (e.g. bottle's global ``response``); confirm
    the import at the top of the file.
    """
    # Mark the outgoing HTTP response as JSON.
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {'url': url, 'status': 'ok'}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(lxmldoc, url)
    if jsonld:
        jsonlde = JsonLdExtractor()
        # NOTE(review): unlike the microdata call, no base URL is passed
        # here — confirm that is intentional.
        result['json-ld'] = jsonlde.extract_items(lxmldoc)
    return result
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Microdata is preferred; JSON-LD is used only when the page exposes no
    microdata items.
    """
    schema_type = "mde"
    extracted = MicrodataExtractor().extract(response.body)
    if len(extracted['items']) == 0:
        extracted = JsonLdExtractor().extract(response.body)
        schema_type = "jsonld"

    output = RecipeItem()
    if schema_type == "mde":
        # Third microdata item holds the recipe on this site's markup.
        recipe = extracted['items'][2]['properties']
        output['recipe_name'] = recipe['name']
        skipped = ('', 'Add all ingredients to list')
        output['ingredients'] = [
            ing for ing in recipe['ingredients'] if ing not in skipped
        ]
        tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe:
            tags.append(recipe['recipeCuisine'])
        output['tags'] = tags
        output['description'] = recipe.get('description')
        output['url'] = recipe['url']
    else:
        recipe = extracted['items'][0]
        output['recipe_name'] = recipe['name']
        output['ingredients'] = recipe['ingredients']
        output['tags'] = [
            tag['properties']['title'] for tag in extracted['items'][1:]
        ]
        output['description'] = recipe.get('description')
        output['url'] = recipe['url']
    yield output