示例#1
0
def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    """Fetch ``url`` and extract the structured metadata embedded in the page.

    The returned dict always carries ``url`` and ``status`` (status code
    followed by the reason phrase). When the response is an HTTP error only
    that summary is returned; otherwise one entry is added for each enabled
    syntax: ``microdata``, ``json-ld`` and ``rdfa``.
    """
    resp = requests.get(url, timeout=30)
    status_line = '{} {}'.format(resp.status_code, resp.reason)
    result = {'url': url, 'status': status_line}

    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        # Error responses are reported through the status line only.
        return result

    # Parse the raw bytes once; every extractor works on the same tree.
    tree = lxml.html.fromstring(
        resp.content, parser=XmlDomHTMLParser(encoding=resp.encoding))

    if microdata:
        result['microdata'] = MicrodataExtractor(
            nested=True).extract_items(tree, resp.url)

    if jsonld:
        result['json-ld'] = JsonLdExtractor().extract_items(tree, resp.url)

    if rdfa:
        result['rdfa'] = RDFaExtractor().extract_items(tree, resp.url)

    return result
 def parse(self, response):
     """Yield a record for each extracted item of a targeted type.

     Items whose ``type`` is not in ``self.target_types`` are skipped. Each
     record holds today's ISO date, the response URL and the item itself
     under ``body``.
     """
     extracted = MicrodataExtractor().extract(response.body)
     for entry in extracted:
         if entry['type'] not in self.target_types:
             continue
         yield {
             'indexed_date': datetime.date.today().isoformat(),
             'url': response.url,
             'body': entry,
         }
示例#3
0
    def parse_item(self, response):
        """Yield the structured-data items of the page, following item URLs.

        Microdata items are flattened to their ``properties`` dict, with the
        microdata ``type`` stored under ``@type``, and merged with the JSON-LD
        items. An item whose ``url`` equals the response URL is yielded; an
        item pointing elsewhere produces a ``Request`` for that URL with this
        method as callback. Items without a ``url`` are skipped.
        """
        def microdata2jsonld(md):
            # Items without properties map to None and are skipped below.
            if md.get('properties'):
                item = md['properties']
                item['@type'] = md.get('type')
                return item

        # Decode the body once and feed the same text to both extractors.
        page_text = response.body_as_unicode()
        items = [
            microdata2jsonld(md)
            for md in MicrodataExtractor().extract(
                page_text, response.url)['items']
        ]
        items += JsonLdExtractor().extract(page_text, response.url)['items']

        if not items:
            self.logger.debug("No Microdata items found for %s", response.url)

        self.logger.debug("Checking URL for item: %s", items)

        for item in items:
            if not item or not item.get('url'):
                self.logger.debug("No URL for item: %s", item)
                continue

            if item['url'] != response.url:
                self.logger.debug("Not in main URL, go there..")
                yield Request(item['url'], callback=self.parse_item)
                continue

            # Copy ``type`` into ``@type`` only when it carries a value (or
            # when no ``@type`` exists yet); otherwise the ``@type`` recorded
            # during extraction would be replaced by None.
            item_type = item.get('type')
            if item_type is not None or '@type' not in item:
                item['@type'] = item_type
            self.logger.debug("Parsed microdata: %s", item)
            yield item
示例#4
0
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        """Run the requested structured-data extractors over ``response``.

        Args:
            response: object exposing ``text``, ``url`` and ``meta``.
            microdata: when true, extract microdata into ``self.mdedata``.
            jsonld: when true, extract JSON-LD into ``self.jldata`` and
                append any ``json-ld`` entries found in ``response.meta``.
            rdfa: when true, extract RDFa into ``self.rdfadata``.

        A ``JSONDecodeError`` during RDFa or microdata extraction is
        swallowed, leaving the corresponding data attribute unset.
        """
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                # NOTE(review): self.rdfadata stays unset on failure; confirm
                # that readers of this attribute cope with that.
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                # NOTE(review): self.mdedata stays unset on failure; confirm
                # that readers of this attribute cope with that.
                pass
        if jsonld:
            # Bind the attribute up front so the ``finally`` clause can always
            # extend it, even when extraction fails with an exception that is
            # not handled here; otherwise an AttributeError would mask the
            # real error.
            self.jldata = []
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                # NOTE(review): assumes extract() returns a list - confirm.
                self.jldata.extend(self.response.meta.get('json-ld', []))
示例#5
0
 def parse_listing(self, response):
     """Yield one dict describing the listing on this page.

     The first two microdata items are used: the properties of the first
     are stored under ``shop``; the second supplies the offer details and
     the name. The remaining fields are scraped with CSS selectors.
     Nothing is yielded when the page has fewer than two microdata items.
     """
     data = MicrodataExtractor().extract(response.body)['items']
     # Both data[0] and data[1] are read below, so a page exposing a single
     # item cannot be parsed and is skipped like an empty one.
     if len(data) < 2:
         return

     # Overview entries mentioning any of these words are left out.
     skipped_words = ('materials', 'feedback', 'favorited', 'ships')
     prod = data[1]['properties']

     it = {}
     it['shop'] = data[0]['properties']
     it.update(prod['offerDetails']['properties'])
     it['name'] = prod['name']
     it['url'] = response.url
     it['properties'] = [
         x
         for x in response.css(
             '#item-overview .properties li::text').extract()
         if all(y not in x.lower() for y in skipped_words)
     ]
     # e0 is a helper defined outside this block; presumably it returns the
     # first extracted value of the selector - confirm.
     it['materials'] = e0(response.css('#overview-materials::text'))
     it['origin'] = e0(response.css('.origin::text'))
     it['imgs'] = response.css(
         '#image-carousel img::attr("src")').extract()
     it['description'] = e0(response.css("#description-text"))
     it['tags'] = response.css('#listing-tag-list li a::text').extract()
     # Only the first four fine-print entries are kept.
     it['fineprints'] = [
         x.strip()
         for x in response.css('#fineprint li::text').extract()[:4]
     ]
     it['rating'] = response.css(
         '.review-rating meta::attr("content")').extract()
     yield it
示例#6
0
 def __init__(self, response):
     """Extract the microdata of ``response`` into ``self.data``.

     The raw body is tried first; if that fails, extraction is retried on
     the body decoded as latin-1.
     """
     mde = MicrodataExtractor()
     try:
         self.data = mde.extract(response.body, response.url)
     except Exception:
         # Catch ``Exception`` rather than everything, so KeyboardInterrupt
         # and SystemExit are not turned into a silent retry.
         self.data = mde.extract(response.body.decode('latin-1'),
                                 response.url)
示例#7
0
 def parse(self, response):
     """Extract the microdata of the response and yield it in one dict.

     The extracted data is yielded under the ``microdata`` key.
     """
     mde = MicrodataExtractor()
     # NOTE(review): assumes the markup to parse is ``response.body``; no
     # other page content is available in this method - confirm.
     beacon_data = mde.extract(response.body)
     # A set literal ``{beacon_data}`` cannot hold an unhashable extraction
     # result (dict or list), so the data is wrapped in a dict instead.
     yield {'microdata': beacon_data}
示例#8
0
    def test_w3c_meter_element(self):
        """Strict extraction of the 4.2 ``meter`` sample matches its JSON."""
        html = get_testdata('w3c', 'microdata.4.2.meter.html')
        raw_json = get_testdata('w3c', 'microdata.4.2.meter.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extracted = MicrodataExtractor(strict=True).extract(html)
        self.assertDictEqual(extracted, expected)
示例#9
0
    def test_w3c_object_element(self):
        """Strict extraction of the ``object`` sample matches its JSON."""
        html = get_testdata('w3c', 'microdata.object.html')
        raw_json = get_testdata('w3c', 'microdata.object.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extractor = MicrodataExtractor(strict=True)
        extracted = extractor.extract(
            html, 'http://www.example.com/microdata/test')
        self.assertEqual(extracted, expected)
示例#10
0
    def test_w3c_7_1(self):
        """Flat (non-nested) strict extraction of sample 7.1 matches its JSON."""
        html = get_testdata('w3c', 'microdata.7.1.html')
        raw_json = get_testdata('w3c', 'microdata.7.1.flat.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extractor = MicrodataExtractor(nested=False, strict=True)
        extracted = extractor.extract(
            html, 'http://blog.example.com/progress-report')
        self.assertEqual(extracted, expected)
示例#11
0
    def test_w3c_5_2(self):
        """Extraction of sample 5.2 with text content matches its JSON."""
        html = get_testdata('w3c', 'microdata.5.2.html')
        raw_json = get_testdata('w3c', 'microdata.5.2.withtext.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extracted = MicrodataExtractor(add_text_content=True).extract(html)
        self.assertEqual(extracted, expected)
示例#12
0
def get_microdata_extruct_items(htmltext):
    """Return the microdata extracted from ``htmltext``.

    ``None`` is returned when the extractor raises ``XMLSyntaxError``.
    """
    extractor = MicrodataExtractor()
    try:
        extracted = extractor.extract(htmltext)
    except XMLSyntaxError:
        # Markup that cannot be parsed yields no result.
        return None
    else:
        return extracted
示例#13
0
    def test_join_none(self):
        """Extraction of the ``product-ref`` sample matches its JSON."""
        html = get_testdata('schema.org', 'product-ref.html')
        raw_json = get_testdata('schema.org', 'product-ref.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extracted = MicrodataExtractor().extract(html)
        self.assertEqual(extracted, expected)
示例#14
0
    def test_w3c_textContent_values(self):
        """Strict extraction of the 4.2 ``strings`` sample matches its JSON."""
        html = get_testdata('w3c', 'microdata.4.2.strings.html')
        raw_json = get_testdata('w3c', 'microdata.4.2.strings.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extracted = MicrodataExtractor(strict=True).extract(html)
        self.assertEqual(extracted, expected)
示例#15
0
    def test_w3c_5_5(self):
        """Strict extraction of sample 5.5 matches its JSON."""
        html = get_testdata('w3c', 'microdata.5.5.html')
        raw_json = get_testdata('w3c', 'microdata.5.5.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extracted = MicrodataExtractor(strict=True).extract(html)
        self.assertEqual(extracted, expected)
示例#16
0
    def parse_product(self, response):
        """Build and yield one product item from the page.

        Values come from the second microdata item of the page; when a value
        is missing there, the ``vgoogle_ecommProd*`` entries scraped from the
        page source are used instead. The promotion text, when present, is
        attached as item metadata.

        Raises:
            KeyError: when a required value is found in neither source.
        """
        mde = MicrodataExtractor()
        data = mde.extract(response.body)

        product_data = data['items'][1]['properties']

        # Collect the ``vgoogle_ecommProd<Key>: ...`` lines of the page source;
        # the first occurrence of each key wins.
        # NOTE(review): splitting on a str separator assumes response.body is
        # a str (Python 2); on Python 3 bytes this raises TypeError - confirm.
        extra_data = {}
        for line in response.body.split('\n'):
            if 'vgoogle_ecommProd' not in line:
                continue
            line_data = line.strip()
            parts = line_data.split(':')
            key = parts[0].strip().replace('vgoogle_ecommProd', '')
            # Drops three characters on each side of the raw value.
            value = parts[1][3:-3]
            if key not in extra_data:
                extra_data[key] = value

        def pick(md_key, extra_key):
            # Look the fallback up only when the microdata lacks the value, so
            # a missing fallback key cannot raise while the value is present.
            if md_key in product_data:
                return product_data[md_key]
            return extra_data[extra_key]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', pick('sku', 'ID'))
        loader.add_value('sku', pick('sku', 'ID'))
        loader.add_value('name', pick('name', 'Name'))
        loader.add_value('url', response.url)
        if 'price' in product_data:
            loader.add_value('price', product_data['price'])
        elif 'Price' in extra_data:
            loader.add_value('price', extra_data['Price'])
        else:
            price = response.xpath('//*[(contains(@class, "product-item") and '
                                   'contains(@class, "product-price")) or @id="price-amount"]//text()')\
                            .re(r'[\d\.,]+')
            loader.add_value('price', price)
        try:
            loader.add_value(
                'category', data['items'][0]['properties']['itemListElement']
                [1]['properties']['name'])
        except Exception:
            # No usable breadcrumb in the microdata: use the scraped category.
            loader.add_value('category', extra_data['Cat'])
        loader.add_value('brand', pick('manufacturer', 'Brand'))
        try:
            loader.add_value(
                'image_url',
                response.urljoin(
                    response.xpath(
                        '//div[@id="prod-img-placehold"]/img/@srcset').re(
                            r'(.*\.jpg)')[0].split(',')[-1].strip()))
        except Exception:
            # The image is optional; the item is emitted without it.
            pass

        item = loader.load_item()

        metadata = SpecSaversMeta()
        promotional_data = response.xpath(
            '//div[@class="arrow-container"]/div/text()').extract()
        metadata['promotion'] = promotional_data[0].strip(
        ) if promotional_data else ''
        item['metadata'] = metadata
        yield item
    def parse(self, response):
        """Yield the properties of each microdata item, then the next page.

        A request for the ``li.next > a`` link is yielded last, if the page
        has one.
        """
        extracted = MicrodataExtractor().extract(response.text, response.url)
        for entry in extracted['items']:
            yield entry['properties']

        next_href = response.css("li.next > a::attr(href)").extract_first()
        if next_href is None:
            return
        yield scrapy.Request(response.urljoin(next_href))
示例#18
0
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    """Parse ``htmlstring`` once and run all three extractors on the tree.

    Returns a dict with the keys ``json-ld``, ``microdata`` and ``rdfa``,
    each holding what the matching extractor's ``extract_items`` returned.
    """
    tree = fromstring(htmlstring, parser=XmlDomHTMLParser(encoding=encoding))
    extractors = (
        ('json-ld', JsonLdExtractor()),
        ('microdata', MicrodataExtractor()),
        ('rdfa', RDFaExtractor()),
    )
    results = {}
    for name, extractor in extractors:
        results[name] = extractor.extract_items(tree, url=url)
    return results
示例#19
0
    def test_join_custom_url(self):
        """Extraction of ``product.html`` with a custom base URL matches its JSON."""
        html = get_testdata('schema.org', 'product.html')
        raw_json = get_testdata('schema.org', 'product_custom_url.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extractor = MicrodataExtractor()
        extracted = extractor.extract(html, base_url='http://some-example.com')
        self.assertEqual(extracted, expected)
示例#20
0
 def parse(self, response):
     """Complete the item carried in ``response.meta`` and yield it.

     URL, title and price are filled in from the response and the
     properties of the first microdata item; the title has the text
     ``Details about`` removed.
     """
     extracted = MicrodataExtractor().extract(response.body_as_unicode())
     first_item = extracted.get('items')[0]
     properties = first_item.get('properties', {})

     item = response.meta.get('item', {})
     item['url'] = response.url

     title = properties.get('name').replace('Details about', '')
     item['title'] = title.strip()

     # A missing offer or price falls back to 0.
     offer = properties.get('offers', {}).get('properties', {})
     item['price'] = float(offer.get('price', 0))
     yield item
示例#21
0
    def test_schemaorg_Event(self):
        """Each numbered ``Event`` sample extracts to its stored JSON."""
        for number in (1, 2, 3, 4, 8):
            html = get_testdata('schema.org',
                                'Event.{:03d}.html'.format(number))
            raw_json = get_testdata('schema.org',
                                    'Event.{:03d}.json'.format(number))
            expected = json.loads(raw_json.decode('UTF-8'))

            extracted = MicrodataExtractor().extract(html)
            self.assertEqual(extracted, expected)
示例#22
0
    def test_if_punctuations_in_description_are_correctly_formatted(self):
        """Extraction of the description sample matches its stored JSON."""
        html = get_testdata('websites', 'microdata-with-description.html')
        raw_json = get_testdata('websites',
                                'microdata-with-description.json')
        expected = json.loads(raw_json.decode('UTF-8'))

        extracted = MicrodataExtractor().extract(html)

        self.assertEqual(extracted, expected)
示例#23
0
    def parse_item(self, response):
        """Parse the recipe page and yield a ``RecipeItem``.

        Microdata is tried first; when it yields no items, JSON-LD is used
        instead. Every listed recipe property is copied onto the output item,
        with ``None`` for properties the recipe lacks. Nothing is yielded
        when neither syntax yields an item.
        """
        def as_items(extracted):
            # Accept both a bare list of items and a dict that wraps the list
            # under an ``items`` key.
            if isinstance(extracted, dict):
                return extracted.get('items', [])
            return extracted

        items = as_items(MicrodataExtractor().extract(response.body))
        if items:
            # Microdata keeps the values under the item's ``properties``.
            recipe = items[0]['properties']
        else:
            items = as_items(JsonLdExtractor().extract(response.body))
            if not items:
                # No structured data at all: there is no recipe to emit.
                return
            # A JSON-LD item carries its values directly.
            recipe = items[0]

        properties = [
            'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
            'recipeIngredient', 'aggregateRating', 'recipeYield',
            'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
            'recipeCategory', 'review', 'prepTime', 'description'
        ]
        recipe_output_item = RecipeItem()
        for prop in properties:
            try:
                recipe_output_item[prop] = recipe[prop]
            except KeyError:
                recipe_output_item[prop] = None

        yield recipe_output_item
示例#24
0
    def test_schemaorg_LocalBusiness(self):
        """Each numbered ``LocalBusiness`` sample extracts to its stored JSON."""
        for number in (2, 3):
            html = get_testdata('schema.org',
                                'LocalBusiness.{:03d}.html'.format(number))
            raw_json = get_testdata(
                'schema.org', 'LocalBusiness.{:03d}.json'.format(number))
            expected = json.loads(raw_json.decode('UTF-8'))

            extracted = MicrodataExtractor().extract(html)
            self.assertEqual(extracted, expected)
示例#25
0
    def test_schemaorg_MusicRecording(self):
        """The ``MusicRecording`` sample extracts to its stored JSON."""
        for number in (1,):
            html = get_testdata('schema.org',
                                'MusicRecording.{:03d}.html'.format(number))
            raw_json = get_testdata(
                'schema.org', 'MusicRecording.{:03d}.json'.format(number))
            expected = json.loads(raw_json.decode('UTF-8'))

            extracted = MicrodataExtractor().extract(html)
            self.assertDictEqual(extracted, expected)
示例#26
0
    def parse(self, response):
        """Yield the targeted microdata items, then follow ``/events`` links.

        Items whose ``type`` is in ``self.target_types`` are stamped with
        today's ISO date and the response URL before being yielded. Every
        link whose href contains ``/events`` is requested with this method as
        callback.
        """
        extracted = MicrodataExtractor().extract(response.body)
        for entry in extracted['items']:
            if entry['type'] not in self.target_types:
                continue
            entry['indexed_date'] = datetime.date.today().isoformat()
            entry['url'] = response.url
            yield entry

        hrefs = response.xpath('//a/@href').extract()
        for href in hrefs:
            if '/events' not in href:
                continue
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
示例#27
0
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                microdata_base_url: str = "",
                extract_json_ld: bool = False,
                extract_rdfa: bool = False,
                rdfa_base_url: str = "") \
            -> List[Extraction]:
        """
        Extract the requested kinds of metadata from an HTML string.

        Args:
            html_text (str): input html string to be extracted
            extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
            extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
            extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
            microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
            extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
            extract_rdfa (bool): True if RDFa needs to be extracted, returns as { "rdfa": [...] }
            rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

        Returns:
            List[Extraction]: the list of extraction or the empty list if there are no matches.
        """
        res = []
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            # ``.string`` is None when the <title> tag is empty or has more
            # than one child; fall back to the tag's concatenated text so the
            # encode call below always has a string to work on.
            title_text = soup.title.string
            if title_text is None:
                title_text = soup.title.get_text()
            title = self._wrap_data("title", title_text.encode('utf-8').decode('utf-8'))
            res.append(title)

        # NOTE(review): meta tags are only extracted when the document also
        # has a <title>; confirm that this coupling is intended.
        if soup.title and extract_meta:
            meta_content = self._wrap_meta_content(soup.find_all("meta"))
            meta_data = self._wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
            res.append(rdfae_data)

        return res
示例#28
0
    def parse_product(self, response):
        """Yield one product item per size variant listed on the page.

        Name, brand, image and categories come from the page's microdata
        (the ``schema.org/Product`` item and the ``Breadcrumb`` items, the
        first breadcrumb being dropped). The variants are read from the JSON
        held in the hidden ``hidProductVariants`` input. A warning is logged
        and nothing is yielded when either the microdata or the variants
        cannot be found.
        """
        mde = MicrodataExtractor()
        try:
            micro_data = mde.extract(response.body)['items']
            # List comprehensions instead of subscripting ``filter(...)``:
            # on Python 3 ``filter`` returns an iterator that cannot be
            # indexed or sliced, which would send every page to the handler
            # below.
            gen_data = [
                a for a in micro_data
                if a['type'] == 'http://schema.org/Product'
            ][0]['properties']
            categories = [
                c['properties']['title']
                for c in micro_data
                if c['type'] == 'http://data-vocabulary.org/Breadcrumb'
            ][1:]
        except Exception:
            self.log('WARNING => Wrong product page in %s' % response.url)
            return

        # A property may hold a list of values; the first one is used.
        main_name = gen_data['name']
        if isinstance(main_name, list):
            main_name = main_name[0]
        main_brand = gen_data.get('brand', '')
        if isinstance(main_brand, list):
            main_brand = main_brand[0]

        variants = response.xpath('//input[@name="ctl00$cphMain$ctl00$hidProductVariants"]/@value').extract()
        if not variants:
            self.log('WARNING: Variants not found in => %s' % response.url)
            return

        data = json.loads(self.html_parser.unescape(variants[0]))
        for d in data:
            for var in d['Variants']:
                for size_data in var['Variants']:
                    color_name = size_data.get('Article', dict()).get('ColorName', '')
                    # From here on only the article part of the entry is used.
                    size_data = size_data['Article']
                    url = self.product_url % size_data
                    identifier = size_data['ItemOfferId']
                    name = main_name + ', ' + color_name + ', ' + size_data['FriendlySize']
                    price = size_data['WebInfo']['ArticlePriceDisplay']['FormattedSalePriceAfterWithCharges']
                    shipping_cost = size_data['FormattedDeliveryFee']
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('name', name)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', size_data['ProductId'])
                    loader.add_value('price', extract_price_eu(price))
                    if shipping_cost:
                        loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
                    loader.add_value('image_url', gen_data['image'][-1])
                    # Any availability code other than 'L' is recorded as
                    # zero stock.
                    if size_data['AvailabilityCode'] != 'L':
                        loader.add_value('stock', 0)
                    loader.add_value('category', categories)
                    if main_brand:
                        loader.add_value('brand', main_brand)
                    yield loader.load_item()
示例#29
0
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch ``url`` and return the structured data found in the page.

    The outgoing ``response`` object (defined outside this function) is
    marked as JSON before the page is fetched. The returned dict holds
    ``url``, a fixed ``status`` of ``ok`` and one entry per enabled syntax:
    ``microdata`` and ``json-ld``.
    """
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)

    lxmldoc = lxml.html.fromstring(
        resp.content, parser=lxml.html.HTMLParser(encoding=resp.encoding))

    result = {'url': url, 'status': 'ok'}

    if microdata:
        result['microdata'] = MicrodataExtractor(
            nested=True).extract_items(lxmldoc, url)

    if jsonld:
        result['json-ld'] = JsonLdExtractor().extract_items(lxmldoc)

    return result
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients.

        Microdata is tried first; when it yields no items, JSON-LD is used
        instead. The yielded ``RecipeItem`` carries the recipe name,
        ingredients, tags, description (``None`` when absent) and URL.
        """
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        if len(data['items']) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            # The recipe is read from the third microdata item of the page.
            recipe = data['items'][2]['properties']
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = [
                ingredient for ingredient in recipe['ingredients']
                if ingredient not in ['', 'Add all ingredients to list']
            ]
            # ``recipeCategory`` may be a single value instead of a list;
            # build a fresh list so the append below works in both cases and
            # the extracted data is not modified in place.
            categories = recipe['recipeCategory']
            if isinstance(categories, list):
                recipe_tags = list(categories)
            else:
                recipe_tags = [categories]
            if 'recipeCuisine' in recipe:
                recipe_tags.append(recipe['recipeCuisine'])
            recipe_output_item['tags'] = recipe_tags
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = recipe['ingredients']
            # Tags are taken from the items that follow the recipe.
            recipe_output_item['tags'] = [
                tag['properties']['title'] for tag in data['items'][1:]
            ]
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']

        yield recipe_output_item