def __init__(self, response):
    """Extract microdata from a scrapy response, retrying on decode trouble.

    :param response: scrapy Response whose ``body`` (bytes) is parsed;
        ``response.url`` is used as the base URL for the extraction.
    """
    mde = MicrodataExtractor()
    try:
        self.data = mde.extract(response.body, response.url)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt /
        # SystemExit. Fallback: latin-1 maps every byte to a codepoint, so
        # this decode never raises and gives the parser a second chance.
        self.data = mde.extract(response.body.decode('latin-1'), response.url)
def test_w3c_5_2(self):
    """W3C microdata example 5.2, flat (non-nested) strict extraction."""
    page = get_testdata('w3c', 'microdata.5.2.html')
    extractor = MicrodataExtractor(nested=False, strict=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.5.2.flat.json').decode('UTF-8'))
    self.assertDictEqual(result, reference)
def test_w3c_5_2(self):
    """W3C microdata example 5.2 with text content included in the output."""
    page = get_testdata('w3c', 'microdata.5.2.html')
    extractor = MicrodataExtractor(add_text_content=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))
    self.assertDictEqual(result, reference)
def test_w3c_5_5(self):
    """W3C microdata example 5.5 under strict extraction."""
    page = get_testdata("w3c", "microdata.5.5.html")
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata("w3c", "microdata.5.5.json").decode("UTF-8"))
    self.assertDictEqual(result, reference)
def parse_listing(self, response):
    """Assemble one listing dict from page microdata plus CSS-selected fields."""
    extracted = MicrodataExtractor().extract(response.body)['items']
    if not extracted:
        return
    listing = {}
    # First microdata item describes the shop, second the product on offer.
    listing['shop'] = extracted[0]['properties']
    product = extracted[1]['properties']
    listing.update(product['offerDetails']['properties'])
    listing['name'] = product['name']
    listing['url'] = response.url
    # Keep only free-form overview bullets, dropping boilerplate rows.
    boilerplate = ['materials', 'feedback', 'favorited', 'ships']
    listing['properties'] = [
        text
        for text in response.css('#item-overview .properties li::text').extract()
        if not any(word in text.lower() for word in boilerplate)
    ]
    listing['materials'] = e0(response.css('#overview-materials::text'))
    listing['origin'] = e0(response.css('.origin::text'))
    listing['imgs'] = response.css('#image-carousel img::attr("src")').extract()
    listing['description'] = e0(response.css("#description-text"))
    listing['tags'] = response.css('#listing-tag-list li a::text').extract()
    listing['fineprints'] = [
        line.strip()
        for line in response.css('#fineprint li::text').extract()[:4]
    ]
    listing['rating'] = response.css(
        '.review-rating meta::attr("content")').extract()
    yield listing
def parse(self, response):
    """Extract microdata from the response and yield it.

    NOTE(review): the original iterated over ``response`` (not iterable as
    pages), extracted from an undefined name ``html_content``, and yielded a
    set literal containing a dict (unhashable -> TypeError). Rewritten to
    extract from the response body and yield the extracted mapping directly.
    """
    mde = MicrodataExtractor()
    beacon_data = mde.extract(response.text)
    yield beacon_data
def parse(self, response):
    """Yield an indexed record for every microdata item of a targeted type."""
    extractor = MicrodataExtractor()
    for entry in extractor.extract(response.body):
        if entry['type'] not in self.target_types:
            continue
        yield {
            'indexed_date': datetime.date.today().isoformat(),
            'url': response.url,
            'body': entry,
        }
def test_w3c_7_1(self):
    """W3C microdata example 7.1 with an explicit document base URL."""
    page = get_testdata('w3c', 'microdata.7.1.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, 'http://blog.example.com/progress-report')
    reference = json.loads(
        get_testdata('w3c', 'microdata.7.1.json').decode('UTF-8'))
    self.assertDictEqual(result, reference)
def test_w3c_object_element(self):
    """<object> element handling, resolved against an explicit base URL."""
    page = get_testdata('w3c', 'microdata.object.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, 'http://www.example.com/microdata/test')
    reference = json.loads(
        get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))
    self.assertDictEqual(result, reference)
def test_w3c_data_element(self):
    """<data> element values are extracted per W3C section 4.2."""
    page = get_testdata('w3c', 'microdata.4.2.data.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.4.2.data.json').decode('UTF-8'))
    self.assertDictEqual(result, reference)
def test_w3c_7_1(self):
    """W3C microdata example 7.1, strict mode, explicit base URL."""
    page = get_testdata("w3c", "microdata.7.1.html")
    reference = json.loads(
        get_testdata("w3c", "microdata.7.1.json").decode("UTF-8"))
    extractor = MicrodataExtractor(strict=True)
    self.assertDictEqual(
        extractor.extract(page, "http://blog.example.com/progress-report"),
        reference)
def test_w3c_5_2(self):
    """W3C microdata example 5.2 with textContent included."""
    page = get_testdata('w3c', 'microdata.5.2.html')
    extractor = MicrodataExtractor(add_text_content=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def get_microdata_extruct_items(htmltext):
    """Return microdata items extracted from *htmltext*.

    :param htmltext: HTML markup to parse.
    :return: the extracted items, or an empty list when the markup cannot
        be parsed, so callers can iterate the result unconditionally.
        (Previously a bare ``return`` yielded ``None`` here, unlike the
        sibling helper that returns ``[]`` on the same failure.)
    """
    mde = MicrodataExtractor()
    try:
        return mde.extract(htmltext)
    except XMLSyntaxError:
        return []  # Nothing to do here
def test_join_none(self):
    """Relative references resolve correctly when no base URL is given."""
    page = get_testdata('schema.org', 'product-ref.html')
    extractor = MicrodataExtractor()
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def test_w3c_textContent_values(self):
    """String property values follow W3C section 4.2 textContent rules."""
    page = get_testdata('w3c', 'microdata.4.2.strings.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.4.2.strings.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def test_w3c_5_5(self):
    """W3C microdata example 5.5, strict extraction."""
    page = get_testdata('w3c', 'microdata.5.5.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.5.5.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def test_w3c_meter_element(self):
    """<meter> element values are extracted per W3C section 4.2."""
    page = get_testdata('w3c', 'microdata.4.2.meter.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('w3c', 'microdata.4.2.meter.json').decode('UTF-8'))
    self.assertDictEqual(result, reference)
def test_w3c_object_element(self):
    """<object> element handling against an explicit base URL."""
    page = get_testdata('w3c', 'microdata.object.html')
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, 'http://www.example.com/microdata/test')
    reference = json.loads(
        get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def test_w3c_7_1(self):
    """W3C example 7.1 flattened (non-nested), with explicit base URL."""
    page = get_testdata('w3c', 'microdata.7.1.html')
    extractor = MicrodataExtractor(nested=False, strict=True)
    result = extractor.extract(page, 'http://blog.example.com/progress-report')
    reference = json.loads(
        get_testdata('w3c', 'microdata.7.1.flat.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def test_schemaorg_MusicRecording(self):
    """Each schema.org MusicRecording fixture matches its reference JSON."""
    for idx in [1]:
        page = get_testdata(
            'schema.org', 'MusicRecording.{:03d}.html'.format(idx))
        reference = json.loads(
            get_testdata(
                'schema.org',
                'MusicRecording.{:03d}.json'.format(idx)).decode('UTF-8'))
        extractor = MicrodataExtractor()
        self.assertDictEqual(extractor.extract(page), reference)
def test_schemaorg_Event(self):
    """Each schema.org Event fixture matches its reference JSON."""
    for idx in [1, 2, 3, 4, 8]:
        page = get_testdata("schema.org", "Event.{:03d}.html".format(idx))
        reference = json.loads(
            get_testdata(
                "schema.org",
                "Event.{:03d}.json".format(idx)).decode("UTF-8"))
        extractor = MicrodataExtractor()
        self.assertDictEqual(extractor.extract(page), reference)
def parse_product(self, response):
    """Build a Product item from page microdata, backed up by inline
    'vgoogle_ecommProd*' analytics variables scraped from the raw body.

    NOTE(review): the two bare ``except:`` clauses below are deliberate
    best-effort fallbacks; ``response.body.split('\n')`` presumes a str
    body (Python 2 era) — confirm against the scrapy version in use.
    """
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    # Second microdata item is assumed to be the product — TODO confirm.
    product_data = data['items'][1]['properties']
    # Scrape key/value pairs embedded for Google Analytics e-commerce
    # tracking, e.g. "vgoogle_ecommProdID: '...';" lines.
    extra_data = {}
    for l in response.body.split('\n'):
        if 'vgoogle_ecommProd' in l:
            line_data = l.strip()
            key = line_data.split(':')[0].strip().replace(
                'vgoogle_ecommProd', '')
            # Slice [3:-3] strips the surrounding quote/space/terminator
            # characters around the value — presumably " '" ... "';".
            value = line_data.split(':')[1][3:-3]
            if key not in extra_data:
                extra_data[key] = value
    loader = ProductLoader(item=Product(), response=response)
    # Prefer microdata fields, fall back to the analytics variables.
    loader.add_value('identifier', product_data.get('sku', extra_data['ID']))
    loader.add_value('sku', product_data.get('sku', extra_data['ID']))
    loader.add_value('name', product_data.get('name', extra_data['Name']))
    loader.add_value('url', response.url)
    # Price: microdata, then analytics, then a last-resort xpath scrape.
    if 'price' in product_data:
        loader.add_value('price', product_data['price'])
    elif 'Price' in extra_data:
        loader.add_value('price', extra_data['Price'])
    else:
        price = response.xpath('//*[(contains(@class, "product-item") and '
                               'contains(@class, "product-price")) or @id="price-amount"]//text()')\
            .re(r'[\d\.,]+')
        loader.add_value('price', price)
    # Category from the breadcrumb microdata; analytics 'Cat' as fallback.
    try:
        loader.add_value(
            'category', data['items'][0]['properties']['itemListElement']
            [1]['properties']['name'])
    except:
        loader.add_value('category', extra_data['Cat'])
    loader.add_value('brand', product_data.get('manufacturer',
                                               extra_data['Brand']))
    # Image: last (largest?) srcset candidate — TODO confirm ordering.
    try:
        loader.add_value(
            'image_url',
            response.urljoin(
                response.xpath(
                    '//div[@id="prod-img-placehold"]/img/@srcset').re(
                        r'(.*\.jpg)')[0].split(',')[-1].strip()))
    except:
        pass
    item = loader.load_item()
    # Attach promotional banner text as site-specific metadata.
    metadata = SpecSaversMeta()
    promotional_data = response.xpath(
        '//div[@class="arrow-container"]/div/text()').extract()
    metadata['promotion'] = promotional_data[0].strip(
    ) if promotional_data else ''
    item['metadata'] = metadata
    yield item
def test_join_custom_url(self):
    """Relative URLs are joined against an explicitly supplied base_url."""
    page = get_testdata('schema.org', 'product.html')
    extractor = MicrodataExtractor()
    result = extractor.extract(page, base_url='http://some-example.com')
    reference = json.loads(
        get_testdata('schema.org', 'product_custom_url.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def parse(self, response):
    """Yield the properties of every microdata item, then follow pagination."""
    extracted = MicrodataExtractor().extract(response.text, response.url)
    for entry in extracted['items']:
        yield entry['properties']
    next_href = response.css("li.next > a::attr(href)").extract_first()
    if next_href is not None:
        yield scrapy.Request(response.urljoin(next_href))
def parse(self, response):
    """Fill the carried-over item with title/price taken from page microdata.

    The partially-built item travels in ``response.meta['item']``; this adds
    the final URL, a cleaned title, and a float price before yielding it.
    """
    extractor = MicrodataExtractor()
    properties = extractor.extract(
        response.body_as_unicode()).get('items')[0].get('properties', {})
    item = response.meta.get('item', {})
    item['url'] = response.url
    # BUG FIX: .get('name') could return None when the property is missing,
    # and None.replace(...) raised AttributeError; default to ''.
    item['title'] = properties.get('name', '').replace('Details about', '').strip()
    item['price'] = float(
        properties.get('offers', {}).get('properties', {}).get('price', 0)
    )
    yield item
def test_if_punctuations_in_description_are_correctly_formatted(self):
    """Punctuation inside description text survives extraction intact."""
    page = get_testdata('websites', 'microdata-with-description.html')
    extractor = MicrodataExtractor()
    result = extractor.extract(page)
    reference = json.loads(
        get_testdata('websites',
                     'microdata-with-description.json').decode('UTF-8'))
    self.assertEqual(result, reference)
def test_schemaorg_Event(self):
    """Each schema.org Event fixture extracts to its reference JSON."""
    for idx in [1, 2, 3, 4, 8]:
        page = get_testdata('schema.org', 'Event.{:03d}.html'.format(idx))
        reference = json.loads(
            get_testdata('schema.org',
                         'Event.{:03d}.json'.format(idx)).decode('UTF-8'))
        extractor = MicrodataExtractor()
        self.assertEqual(extractor.extract(page), reference)
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Tries microdata first, falling back to JSON-LD when the page exposes
    none, then copies a fixed set of schema.org Recipe properties into a
    RecipeItem (missing properties become None).

    NOTE(review): removed the large blocks of commented-out dead code and
    debug prints. The two branches read the extraction result differently
    (``data[0]`` vs ``data['items'][0]``) — presumably matching the two
    extractors' return shapes; confirm against the extruct version in use.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data) == 0:
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        recipe = data[0]['properties']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
    properties = [
        'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
        'recipeIngredient', 'aggregateRating', 'recipeYield',
        'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
        'recipeCategory', 'review', 'prepTime', 'description'
    ]
    recipe_output_item = RecipeItem()
    for prop in properties:
        try:
            recipe_output_item[prop] = recipe[prop]
        except KeyError:
            recipe_output_item[prop] = None
    yield recipe_output_item
def test_schemaorg_LocalBusiness(self):
    """Each schema.org LocalBusiness fixture extracts to its reference JSON."""
    for idx in [2, 3]:
        page = get_testdata('schema.org',
                            'LocalBusiness.{:03d}.html'.format(idx))
        reference = json.loads(
            get_testdata(
                'schema.org',
                'LocalBusiness.{:03d}.json'.format(idx)).decode('UTF-8'))
        extractor = MicrodataExtractor()
        self.assertEqual(extractor.extract(page), reference)
def test_schemaorg_MusicRecording(self):
    """schema.org MusicRecording fixtures round-trip to reference JSON."""
    for idx in [1]:
        page = get_testdata('schema.org',
                            'MusicRecording.{:03d}.html'.format(idx))
        extractor = MicrodataExtractor()
        result = extractor.extract(page)
        reference = json.loads(
            get_testdata(
                'schema.org',
                'MusicRecording.{:03d}.json'.format(idx)).decode('UTF-8'))
        self.assertDictEqual(result, reference)
def parse(self, response):
    """Yield target-typed microdata items, then crawl further /events links."""
    extracted = MicrodataExtractor().extract(response.body)
    for entry in extracted['items']:
        if entry['type'] in self.target_types:
            # Stamp provenance onto the item before emitting it.
            entry['indexed_date'] = datetime.date.today().isoformat()
            entry['url'] = response.url
            yield entry
    for href in response.xpath('//a/@href').extract():
        if '/events' in href:
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            microdata_base_url: str = "",
            extract_json_ld: bool = False,
            extract_rdfa: bool = False,
            rdfa_base_url: str = "") \
        -> List[Extraction]:
    """
    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
        extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
        extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
        microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
        extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
        extract_rdfa (bool): True if rdfs needs to be extracted, returns as { "rdfa": [...] }
        rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

    Returns:
        List[Extraction]: the list of extraction or the empty list if there are no matches.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    # Also require a non-None title string: an empty <title/> tag would
    # make soup.title.string None and .encode() would raise.
    if extract_title and soup.title and soup.title.string:
        title = self._wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    # BUG FIX: meta extraction was gated on "soup.title and extract_meta",
    # silently skipping <meta> tags on documents without a <title>.
    if extract_meta:
        meta_content = self._wrap_meta_content(soup.find_all("meta"))
        meta_data = self._wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self._wrap_data(
            "microdata", mde.extract(html_text, microdata_base_url))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self._wrap_data(
            "rdfa", rdfae.extract(html_text, rdfa_base_url))
        res.append(rdfae_data)
    return res
def parse_product(self, response):
    """Yield one Product item per size/colour variant of a product page.

    General product data comes from schema.org microdata; the variant
    matrix is read from a hidden JSON form field.

    NOTE(review): ``filter(...)[0]`` only works on Python 2 where filter
    returns a list; on Python 3 this raises TypeError and is swallowed by
    the bare except below, skipping the page entirely. Confirm target
    interpreter before reuse.
    """
    mde = MicrodataExtractor()
    try:
        micro_data = mde.extract(response.body)['items']
        gen_data = filter(lambda a: a['type'] == 'http://schema.org/Product', micro_data)[0]['properties']
        # Breadcrumb titles, dropping the first element (site root).
        categories = [c['properties']['title'] for c in filter(lambda d: d['type'] == 'http://data-vocabulary.org/Breadcrumb', micro_data)][1:]
    except:
        self.log('WARNING => Wrong product page in %s' % response.url)
        return
    # Microdata may expose single values or lists; normalise to scalars.
    main_name = gen_data['name']
    if isinstance(main_name, list):
        main_name = main_name[0]
    main_brand = gen_data.get('brand', '')
    if isinstance(main_brand, list):
        main_brand = main_brand[0]
    # Variant matrix is HTML-escaped JSON inside a hidden ASP.NET input.
    variants = response.xpath('//input[@name="ctl00$cphMain$ctl00$hidProductVariants"]/@value').extract()
    if variants:
        data = json.loads(self.html_parser.unescape(variants[0]))
        # Nested structure: top-level groups -> colour variants -> sizes.
        for d in data:
            for var in d['Variants']:
                for size_data in var['Variants']:
                    color_name = size_data.get('Article', dict()).get('ColorName', '')
                    size_data = size_data['Article']
                    url = self.product_url % size_data
                    identifier = size_data['ItemOfferId']
                    name = main_name + ', ' + color_name + ', ' + size_data['FriendlySize']
                    price = size_data['WebInfo']['ArticlePriceDisplay']['FormattedSalePriceAfterWithCharges']
                    shipping_cost = size_data['FormattedDeliveryFee']
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('name', name)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', size_data['ProductId'])
                    loader.add_value('price', extract_price_eu(price))
                    if shipping_cost:
                        loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
                    loader.add_value('image_url', gen_data['image'][-1])
                    # 'L' availability code presumably means in stock —
                    # TODO confirm; anything else is recorded as stock 0.
                    if size_data['AvailabilityCode'] != 'L':
                        loader.add_value('stock', 0)
                    loader.add_value('category', categories)
                    if main_brand:
                        loader.add_value('brand', main_brand)
                    yield loader.load_item()
    else:
        self.log('WARNING: Variants not found in => %s' % response.url)
def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    # Prefer microdata; fall back to JSON-LD when the page exposes none.
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        # Third microdata item is assumed to be the recipe — TODO confirm.
        recipe = data['items'][2]['properties']
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        # Drop empty entries and the site's "Add all ..." pseudo-ingredient.
        recipe_output_item['ingredients'] = [
            ingredient for ingredient in recipe['ingredients']
            if ingredient not in ['', 'Add all ingredients to list']
        ]
        # NOTE: this aliases (and appends into) the extracted list itself.
        recipe_tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe.keys():
            recipe_tags.append(recipe['recipeCuisine'])
        recipe_output_item['tags'] = recipe_tags
        try:
            recipe_output_item['description'] = recipe['description']
        except KeyError:
            recipe_output_item['description'] = None
        recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        recipe_output_item['ingredients'] = recipe['ingredients']
        # Remaining JSON-LD items are treated as tag entries.
        recipe_output_item['tags'] = [
            tag['properties']['title'] for tag in data['items'][1:]
        ]
        try:
            recipe_output_item['description'] = recipe['description']
        except KeyError:
            recipe_output_item['description'] = None
        recipe_output_item['url'] = recipe['url']
    yield recipe_output_item
def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    extractor = MicrodataExtractor()
    data = extractor.extract(response.body)
    schema_type = "mde"
    # No microdata on the page? Retry with JSON-LD.
    if len(data['items']) == 0:
        data = JsonLdExtractor().extract(response.body)
        schema_type = "jsonld"
    output = RecipeItem()
    if schema_type == "mde":
        recipe = data['items'][2]['properties']
        output['recipe_name'] = recipe['name']
        unwanted = ['', 'Add all ingredients to list']
        output['ingredients'] = [
            entry for entry in recipe['ingredients'] if entry not in unwanted
        ]
        # Intentionally aliases the extracted category list before appending.
        tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe.keys():
            tags.append(recipe['recipeCuisine'])
        output['tags'] = tags
        output['description'] = recipe.get('description')
        output['url'] = recipe['url']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
        output['recipe_name'] = recipe['name']
        output['ingredients'] = recipe['ingredients']
        output['tags'] = [entry['properties']['title']
                          for entry in data['items'][1:]]
        output['description'] = recipe.get('description')
        output['url'] = recipe['url']
    yield output
def parse_listing(self, response):
    """Scrape one listing page: microdata items plus CSS-selected details."""
    mde = MicrodataExtractor()
    data = mde.extract(response.body)['items']
    if data:
        it = {}
        # First microdata item is the shop, second the product on offer.
        it['shop'] = data[0]['properties']
        prod = data[1]['properties']
        it.update(prod['offerDetails']['properties'])
        it['name'] = prod['name']
        it['url'] = response.url
        # Keep only free-form overview bullets, dropping boilerplate rows.
        it['properties'] = [x for x in response.css('#item-overview .properties li::text').extract() \
            if all(y not in x.lower() for y in ['materials','feedback', 'favorited', 'ships'])]
        it['materials'] = e0(response.css('#overview-materials::text'))
        it['origin'] = e0(response.css('.origin::text'))
        it['imgs'] = response.css('#image-carousel img::attr("src")').extract()
        it['description'] = e0(response.css("#description-text"))
        it['tags'] = response.css('#listing-tag-list li a::text').extract()
        # Only the first four fine-print lines are kept.
        it['fineprints'] = [x.strip() for x in response.css('#fineprint li::text').extract()[:4]]
        it['rating'] = response.css('.review-rating meta::attr("content")').extract()
        #it['html'] = response.body
        yield it
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            extract_json_ld: bool = False,
            extract_rdfa: bool = False) \
        -> List[Extraction]:
    """Extract the requested metadata kinds from *html_text*.

    Each enabled flag contributes one wrapped entry ("title", "meta",
    "microdata", "json-ld", "rdfa") to the returned list.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    # Require a non-None title string: an empty <title/> would make
    # soup.title.string None and .encode() would raise.
    if extract_title and soup.title and soup.title.string:
        title = self.wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    # BUG FIX: meta extraction was gated on "soup.title and extract_meta",
    # silently skipping <meta> tags on documents without a <title>.
    if extract_meta:
        meta_content = self.wrap_meta_content(soup.find_all("meta"))
        meta_data = self.wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self.wrap_data("microdata", mde.extract(html_text))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
        res.append(rdfae_data)
    return res
def parse(self, response):
    """Extract microdata from the page and print it for inspection.

    Fixes the Python 2 ``print items`` statement (a SyntaxError on
    Python 3) and drops the unused ``selector`` local.
    """
    extractor = MicrodataExtractor()
    items = extractor.extract(response.body_as_unicode(), response.url)
    print(items)
def parse_products(self, response):
    """Emit Product items for every schema.org Product on a listing page,
    pairing each microdata item with its on-page <div id="Products_..."> by
    position, then follow the remaining pagination links.
    """
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    category = response.meta['category']
    selectors = response.xpath('//div[contains(@id, "Products_")]')
    products = filter(lambda d: d['type'] == 'http://schema.org/Product',
                      data['items'])
    # zip pairs microdata and DOM blocks positionally — assumes identical
    # ordering and count; TODO confirm against the site markup.
    for product_data, product_xs in zip(products, selectors):
        properties = product_data['properties']
        try:
            offer = properties['offers']['properties']
        except:
            self.log('Offers are not found for %s => %s' %
                     (properties['name'], response.url))
            continue
        brand = product_xs.xpath(
            './/div[@class="Image"]//img[contains(@alt, "View more ")]/@alt'
        ).re(r'View more (.*) products')
        product_url = product_xs.xpath(
            './/div[@class="Info"]//h2/a[contains(@href, "/products/")]/@href'
        ).extract()
        if not product_url:
            self.log('Not product url in => %s' % response.url)
            continue
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', properties['mpn'])
        loader.add_value('url', response.urljoin(product_url[0]))
        loader.add_value('name', properties['name'])
        # Listed price divided by 1.2 — presumably stripping 20% VAT.
        loader.add_value(
            'price',
            round(
                Decimal(offer['price'].replace(',', '')) / Decimal('1.2'),
                2))
        loader.add_value('sku', properties['mpn'])
        loader.add_value('category', category)
        loader.add_value(
            'image_url',
            urljoin_rfc('http://www.broadbandbuyer.com/images/products/',
                        properties['image']))
        if brand:
            loader.add_value('brand', brand[0])
        # Flat shipping cost for this retailer.
        loader.add_value('shipping_cost', '13')
        in_stock = (offer['availability'] == 'http://schema.org/InStock')
        if not in_stock:
            loader.add_value('stock', 0)
        else:
            stock_no = product_xs.xpath(
                './/div[@class="Info"]//span[@class="Stock3"]/text()').re(
                    r'(\d+)')
            if stock_no:
                loader.add_value('stock', stock_no[0])
        item = loader.load_item()
        self.yield_item(item)
    # Follow every non-active pagination link, preserving the cookie jar.
    page_urls = set(
        response.xpath(
            '//div[@class="pages"]/a[not(contains(@class, "active"))'
            ' and contains(@href, "page=")]/@href').extract())
    for url in page_urls:
        yield Request(response.urljoin(url),
                      callback=self.parse_products,
                      meta={
                          'cookiejar': response.meta['cookiejar'],
                          'category': response.meta['category']
                      })
def get_review_items_from_microdata(spider, review_type, response, product,
                                    reviews_xpath=None, pros_xpath=None,
                                    cons_xpath=None):
    '''
    Get all reviews from a page, useful for user review pages with microdata
    :param spider: the spider we use to scrape the site
    :param review_type: type of the reviews to scrape, should be either USER or PRO
    :param response: an instance of Scrapy's Response object where reviews will be scraped from
    :param product: the product item the reviews are written for
    :param reviews_xpath: the xpath to extract review selectors from 'response'
    :param pros_xpath: the xpath to extract pros from review selectors
    :param cons_xpath: the xpath to extract cons from review selectors
    :return: list of all review items extracted
    '''
    mde = MicrodataExtractor()
    try:
        items = mde.extract(response.text)
    except XMLSyntaxError:
        return []  # Nothing to do here...
    all_review_extracts = [
        i for i in items if i['type'] == "http://schema.org/Review"
    ]
    all_pros = []
    all_cons = []
    if reviews_xpath:
        add_pros_and_cons = True
        all_reviews = response.xpath(reviews_xpath)
        # Collect a pros/cons pair for every review node found by xpath;
        # empty strings when the corresponding xpath was not supplied.
        for single_review in all_reviews:
            if pros_xpath:
                pros = spider.extract_all(single_review.xpath(pros_xpath),
                                          separator=' ; ')
            else:
                pros = ''
            if cons_xpath:
                cons = spider.extract_all(single_review.xpath(cons_xpath),
                                          separator=' ; ')
            else:
                cons = ''
            all_pros.append(pros)
            all_cons.append(cons)
        # Positional pairing below is only safe when the xpath and the
        # microdata yield the same number of reviews; otherwise bail out.
        if len(all_pros) != len(all_review_extracts) or len(all_cons) != len(
                all_review_extracts):
            spider.logger.warning(
                "Number of reviews extracted from xpath is different from number of review microdata."
            )
            add_pros_and_cons = False
    else:
        add_pros_and_cons = False
    review_items = []
    for index, item in enumerate(all_review_extracts):
        if add_pros_and_cons:
            review = review_microdata_extruct(item,
                                              product=product,
                                              tp=review_type,
                                              pros=all_pros[index],
                                              cons=all_cons[index])
        else:
            review = review_microdata_extruct(item,
                                              product=product,
                                              tp=review_type)
        review_items.append(review)
    return review_items
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""

    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    # test on body of crawlers!

    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        """Parse *response* once for each enabled metadata format.

        BUG FIX: the rdfa/microdata branches previously did the extraction
        inside ``try/except JSONDecodeError: pass`` without initialising
        ``self.rdfadata`` / ``self.mdedata`` first, so a failed parse left
        the attribute unset and extract_newsarticle_schemaorg() crashed
        with AttributeError. Defaults are now set before the attempt.
        """
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa
        if rdfa:
            self.rdfae = RDFaExtractor()
            self.rdfadata = []  # usable even if extraction fails below
            try:
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            self.mde = MicrodataExtractor()
            self.mdedata = []  # usable even if extraction fails below
            try:
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
        supported metadata format. Note that we only try to extract the
        *first* block of NewsArticle data for each method (which is then
        combined with the first extracted from other methods if more than
        one is selected."""
        # Per-call overrides fall back to the flags chosen at construction.
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                # logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
        # logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd