Example #1
    def test_w3c_object_element(self):
        body = get_testdata('w3c', 'microdata.object.html')
        expected = json.loads(get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body, 'http://www.example.com/microdata/test')
        self.assertDictEqual(data, expected)
Example #2
    def test_w3c_5_5(self):
        body = get_testdata("w3c", "microdata.5.5.html")
        expected = json.loads(get_testdata("w3c", "microdata.5.5.json").decode("UTF-8"))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body)
        self.assertDictEqual(data, expected)
Example #3
    def test_w3c_7_1(self):
        body = get_testdata("w3c", "microdata.7.1.html")
        expected = json.loads(get_testdata("w3c", "microdata.7.1.json").decode("UTF-8"))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body, "http://blog.example.com/progress-report")
        self.assertDictEqual(data, expected)
Example #4
    def test_w3c_7_1(self):
        body = get_testdata('w3c', 'microdata.7.1.html')
        expected = json.loads(get_testdata('w3c', 'microdata.7.1.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body, 'http://blog.example.com/progress-report')
        self.assertDictEqual(data, expected)
Example #5
    def test_w3c_data_element(self):
        body = get_testdata('w3c', 'microdata.4.2.data.html')
        expected = json.loads(get_testdata('w3c', 'microdata.4.2.data.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body)
        self.assertDictEqual(data, expected)
Example #6
    def test_w3c_5_2(self):
        body = get_testdata('w3c', 'microdata.5.2.html')
        expected = json.loads(get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))

        mde = MicrodataExtractor(add_text_content=True)
        data = mde.extract(body)
        self.assertDictEqual(data, expected)
Example #7
    def test_w3c_5_2(self):
        body = get_testdata('w3c', 'microdata.5.2.html')
        expected = json.loads(get_testdata('w3c', 'microdata.5.2.flat.json').decode('UTF-8'))

        mde = MicrodataExtractor(nested=False, strict=True)
        data = mde.extract(body)
        self.assertDictEqual(data, expected)
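
Note: the constructor flags exercised by these tests control the output shape; the fixtures pair nested=False with the *.flat.json expectations and add_text_content=True with the *.withtext.json ones. A minimal sketch against made-up HTML (not from the W3C test data; the exact return shape depends on the extruct version, which is why some examples below index data['items'] while others index data directly):

# Minimal sketch (illustrative HTML, not from the test fixtures).
from extruct.w3cmicrodata import MicrodataExtractor

html = '''<div itemscope itemtype="http://schema.org/Person">
  <span itemprop="name">Jane Doe</span>
</div>'''

nested = MicrodataExtractor(strict=True).extract(html)
flat = MicrodataExtractor(nested=False, strict=True).extract(html)
with_text = MicrodataExtractor(add_text_content=True).extract(html)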
Example #8
    def test_w3c_object_element(self):
        body = get_testdata('w3c', 'microdata.object.html')
        expected = json.loads(
            get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body, 'http://www.example.com/microdata/test')
        self.assertEqual(data, expected)
Example #9
    def test_w3c_meter_element(self):
        body = get_testdata('w3c', 'microdata.4.2.meter.html')
        expected = json.loads(
            get_testdata('w3c', 'microdata.4.2.meter.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body)
        self.assertDictEqual(data, expected)
Example #10
    def test_w3c_5_2(self):
        body = get_testdata('w3c', 'microdata.5.2.html')
        expected = json.loads(
            get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))

        mde = MicrodataExtractor(add_text_content=True)
        data = mde.extract(body)
        self.assertEqual(data, expected)
Example #11
    def test_join_none(self):
        body = get_testdata('schema.org', 'product.html')
        expected = json.loads(
            get_testdata('schema.org', 'product.json').decode('UTF-8'))

        mde = MicrodataExtractor()
        data = mde.extract(body)
        self.assertEqual(data, expected)
Example #12
    def test_w3c_textContent_values(self):
        body = get_testdata('w3c', 'microdata.4.2.strings.html')
        expected = json.loads(
            get_testdata('w3c', 'microdata.4.2.strings.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body)
        self.assertEqual(data, expected)
Example #13
    def test_w3c_5_3(self):
        body = get_testdata('w3c', 'microdata.5.3.html')
        expected = json.loads(
            get_testdata('w3c', 'microdata.5.3.json').decode('UTF-8'))

        mde = MicrodataExtractor(strict=True)
        data = mde.extract(body)
        self.assertEqual(data, expected)
Example #14
    def test_schemaorg_MusicRecording(self):
        for i in [1]:
            body = get_testdata('schema.org', 'MusicRecording.{:03d}.html'.format(i))
            expected = json.loads(get_testdata('schema.org', 'MusicRecording.{:03d}.json'.format(i)).decode('UTF-8'))

            mde = MicrodataExtractor()
            data = mde.extract(body)
            self.assertDictEqual(data, expected)
Example #15
    def test_w3c_7_1(self):
        body = get_testdata('w3c', 'microdata.7.1.html')
        expected = json.loads(
            get_testdata('w3c', 'microdata.7.1.flat.json').decode('UTF-8'))

        mde = MicrodataExtractor(nested=False, strict=True)
        data = mde.extract(body, 'http://blog.example.com/progress-report')
        self.assertEqual(data, expected)
Example #16
    def test_schemaorg_Event(self):
        for i in [1, 2, 3, 4, 8]:
            body = get_testdata("schema.org", "Event.{:03d}.html".format(i))
            expected = json.loads(get_testdata("schema.org", "Event.{:03d}.json".format(i)).decode("UTF-8"))

            mde = MicrodataExtractor()
            data = mde.extract(body)
            self.assertDictEqual(data, expected)
Example #17
    def parse_product(self, response):
        mde = MicrodataExtractor()
        data = mde.extract(response.body)

        product_data = data['items'][1]['properties']

        extra_data = {}
        for line in response.text.split('\n'):
            if 'vgoogle_ecommProd' in line:
                line_data = line.strip()
                key = line_data.split(':')[0].strip().replace(
                    'vgoogle_ecommProd', '')
                value = line_data.split(':')[1][3:-3]
                if key not in extra_data:
                    extra_data[key] = value

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier',
                         product_data.get('sku', extra_data['ID']))
        loader.add_value('sku', product_data.get('sku', extra_data['ID']))
        loader.add_value('name', product_data.get('name', extra_data['Name']))
        loader.add_value('url', response.url)
        if 'price' in product_data:
            loader.add_value('price', product_data['price'])
        elif 'Price' in extra_data:
            loader.add_value('price', extra_data['Price'])
        else:
            price = response.xpath('//*[(contains(@class, "product-item") and '
                                   'contains(@class, "product-price")) or @id="price-amount"]//text()')\
                            .re(r'[\d\.,]+')
            loader.add_value('price', price)
        try:
            loader.add_value(
                'category', data['items'][0]['properties']['itemListElement']
                [1]['properties']['name'])
        except (KeyError, IndexError):
            loader.add_value('category', extra_data['Cat'])
        loader.add_value('brand',
                         product_data.get('manufacturer', extra_data['Brand']))
        try:
            loader.add_value(
                'image_url',
                response.urljoin(
                    response.xpath(
                        '//div[@id="prod-img-placehold"]/img/@srcset').re(
                            r'(.*\.jpg)')[0].split(',')[-1].strip()))
        except IndexError:
            pass

        item = loader.load_item()

        metadata = SpecSaversMeta()
        promotional_data = response.xpath(
            '//div[@class="arrow-container"]/div/text()').extract()
        metadata['promotion'] = (promotional_data[0].strip()
                                 if promotional_data else '')
        item['metadata'] = metadata
        yield item
Example #18
    def test_join_custom_url(self):
        body = get_testdata('schema.org', 'product.html')
        expected = json.loads(
            get_testdata('schema.org',
                         'product_custom_url.json').decode('UTF-8'))

        mde = MicrodataExtractor()
        data = mde.extract(body, base_url='http://some-example.com')
        self.assertEqual(data, expected)
Example #19
    def parse(self, response):
        extractor = MicrodataExtractor()
        items = extractor.extract(response.text, response.url)['items']
        for it in items:
            yield it['properties']

        next_page_url = response.css("li.next > a::attr(href)").extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
Example #20
    def parse(self, response):
        extractor = MicrodataExtractor()
        properties = extractor.extract(response.body_as_unicode()).get('items', [{}])[0].get('properties', {})
        item = response.meta.get('item', {})
        item['url'] = response.url
        item['title'] = properties.get('name', '').replace('Details about', '').strip()
        item['price'] = float(
            properties.get('offers', {}).get('properties', {}).get('price', 0)
        )
        yield item
Example #21
    def test_if_punctuations_in_description_are_correctly_formatted(self):
        body = get_testdata('websites', 'microdata-with-description.html')
        expected = json.loads(
            get_testdata('websites',
                         'microdata-with-description.json').decode('UTF-8'))

        mde = MicrodataExtractor()
        data = mde.extract(body)

        self.assertEqual(data, expected)
Example #22
    def test_schemaorg_Event(self):
        for i in [1, 2, 3, 4, 8]:
            body = get_testdata('schema.org', 'Event.{:03d}.html'.format(i))
            expected = json.loads(
                get_testdata('schema.org',
                             'Event.{:03d}.json'.format(i)).decode('UTF-8'))

            mde = MicrodataExtractor()
            data = mde.extract(body)
            self.assertEqual(data, expected)
Example #23
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients."""
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        # print('response.body:', response.body)
        # print('data:', data)
        if len(data) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            recipe = data[0]['properties']
            # recipe_output_item = RecipeItem()
            # recipe_output_item['recipe_name'] = recipe['name']
            # recipe_output_item['ingredients'] = [
            #     ingredient for ingredient in recipe['ingredients']
            #     if ingredient not in ['', 'Add all ingredients to list']
            # ]
            # recipe_output_item['tags'] = [tag['properties']['title']
            #                               for tag in data['items'][1:]]
            # try:
            #   recipe_output_item['description'] = recipe['description']
            # except KeyError:
            #   recipe_output_item['description'] = None
            # recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data[0]
            # recipe_output_item = RecipeItem()
            # recipe_output_item['recipe_name'] = recipe['name']
            # recipe_output_item['ingredients'] = recipe['ingredients']
            # recipe_output_item['tags'] = [tag['properties']['title']
            #                               for tag in data['items'][1:]]
            # try:
            #   recipe_output_item['description'] = recipe['description']
            # except KeyError:
            #   recipe_output_item['description'] = None
            # recipe_output_item['url'] = recipe['url']

        properties = [
            'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
            'recipeIngredient', 'aggregateRating', 'recipeYield',
            'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
            'recipeCategory', 'review', 'prepTime', 'description'
        ]
        recipe_output_item = RecipeItem()
        for prop in properties:
            try:
                recipe_output_item[prop] = recipe[prop]
            except KeyError:
                recipe_output_item[prop] = None

        yield recipe_output_item
Example #24
    def test_schemaorg_LocalBusiness(self):
        for i in [2, 3]:
            body = get_testdata('schema.org',
                                'LocalBusiness.{:03d}.html'.format(i))
            expected = json.loads(
                get_testdata(
                    'schema.org',
                    'LocalBusiness.{:03d}.json'.format(i)).decode('UTF-8'))

            mde = MicrodataExtractor()
            data = mde.extract(body)
            self.assertEqual(data, expected)
Example #25
    def test_schemaorg_MusicRecording(self):
        for i in [1]:
            body = get_testdata('schema.org',
                                'MusicRecording.{:03d}.html'.format(i))
            expected = json.loads(
                get_testdata(
                    'schema.org',
                    'MusicRecording.{:03d}.json'.format(i)).decode('UTF-8'))

            mde = MicrodataExtractor()
            data = mde.extract(body)
            self.assertDictEqual(data, expected)
Example #26
    def parse(self, response):
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        for item in data['items']:
            if item['type'] in self.target_types:
                item['indexed_date'] = datetime.date.today().isoformat()
                item['url'] = response.url
                yield item

        for url in response.xpath('//a/@href').extract():
            if '/events' in url:
                yield scrapy.Request(response.urljoin(url),
                                     callback=self.parse)
Example #27
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                microdata_base_url: str = "",
                extract_json_ld: bool = False,
                extract_rdfa: bool = False,
                rdfa_base_url: str = "") \
            -> List[Extraction]:
        """
        Args:
            html_text (str): input html string to be extracted
            extract_title (bool): True if the 'title' tag string should be extracted, returned as { "title": "..." }
            extract_meta (bool): True if the 'meta' tag strings should be extracted, returned as { "meta": { "author": "...", ...}}
            extract_microdata (bool): True if microdata should be extracted, returned as { "microdata": [...] }
            microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
            extract_json_ld (bool): True if json-ld should be extracted, returned as { "json-ld": [...] }
            extract_rdfa (bool): True if rdfa should be extracted, returned as { "rdfa": [...] }
            rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

        Returns:
            List[Extraction]: the list of extractions, or the empty list if there are no matches.
        """
        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self._wrap_data("title", soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if soup.title and extract_meta:
            meta_content = self._wrap_meta_content(soup.find_all("meta"))
            meta_data = self._wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
            res.append(rdfae_data)

        return res
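
A self-contained sketch of the same flow, using BeautifulSoup and extruct directly rather than the class above (the HTML string and result dict layout are illustrative, not etk's wrapping):

# Hedged sketch mirroring the method above without the class wrapper.
from bs4 import BeautifulSoup
from extruct.jsonld import JsonLdExtractor
from extruct.w3cmicrodata import MicrodataExtractor

html_text = '<html><head><title>Demo</title></head><body></body></html>'
soup = BeautifulSoup(html_text, 'html.parser')

result = {}
if soup.title:
    result['title'] = soup.title.string
result['microdata'] = MicrodataExtractor().extract(html_text)
result['json-ld'] = JsonLdExtractor().extract(html_text)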
Example #28
    def parse_product(self, response):
        mde = MicrodataExtractor()
        try:
            micro_data = mde.extract(response.body)['items']
            # List comprehensions instead of filter(...)[0], which breaks on
            # Python 3 where filter() returns an iterator.
            gen_data = [d for d in micro_data
                        if d['type'] == 'http://schema.org/Product'][0]['properties']
            categories = [c['properties']['title']
                          for c in micro_data
                          if c['type'] == 'http://data-vocabulary.org/Breadcrumb'][1:]
        except (KeyError, IndexError):
            self.log('WARNING => Wrong product page in %s' % response.url)
            return

        main_name = gen_data['name']
        if isinstance(main_name, list):
            main_name = main_name[0]
        main_brand = gen_data.get('brand', '')
        if isinstance(main_brand, list):
            main_brand = main_brand[0]

        variants = response.xpath('//input[@name="ctl00$cphMain$ctl00$hidProductVariants"]/@value').extract()
        if variants:
            data = json.loads(self.html_parser.unescape(variants[0]))
            for d in data:
                for var in d['Variants']:
                    for size_data in var['Variants']:
                        color_name = size_data.get('Article', dict()).get('ColorName', '')
                        size_data = size_data['Article']
                        url = self.product_url % size_data
                        identifier = size_data['ItemOfferId']
                        name = main_name + ', ' + color_name + ', ' + size_data['FriendlySize']
                        price = size_data['WebInfo']['ArticlePriceDisplay']['FormattedSalePriceAfterWithCharges']
                        shipping_cost = size_data['FormattedDeliveryFee']
                        loader = ProductLoader(item=Product(), response=response)
                        loader.add_value('name', name)
                        loader.add_value('url', url)
                        loader.add_value('identifier', identifier)
                        loader.add_value('sku', size_data['ProductId'])
                        loader.add_value('price', extract_price_eu(price))
                        if shipping_cost:
                            loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
                        loader.add_value('image_url', gen_data['image'][-1])
                        if size_data['AvailabilityCode'] != 'L':
                            loader.add_value('stock', 0)
                        loader.add_value('category', categories)
                        if main_brand:
                            loader.add_value('brand', main_brand)
                        yield loader.load_item()
        else:
            self.log('WARNING: Variants not found in => %s' % response.url)
Example #29
    def parse_item(self, response):
        items = []
        def microdata2jsonld(md):
            if md.get('properties'):
                item = md['properties']
                item['@type'] = md.get('type')
                return item
        items += map(microdata2jsonld, MicrodataExtractor().extract(
            response.body_as_unicode(), response.url)['items'])
        items += JsonLdExtractor().extract(
            response.body_as_unicode(), response.url)['items']

        if not items:
            self.logger.debug("No Microdata items found for %s", response.url)

        self.logger.debug("Checking URL for item: %s" , items)

        for item in items:
            if not item or not item.get('url'):
                self.logger.debug("No URL for item: %s" , item)
                continue

            if item['url'] != response.url:
                self.logger.debug("Not in main URL, go there..")
                yield Request(item['url'], callback=self.parse_item)
            else:
                item['@type'] = item.get('type')
                self.logger.debug("Parsed microdata: %s" % item)
                yield item
Example #30
def async_extruct(url, microdata=True, jsonld=True):
    response.content_type = 'application/json'  # assumes Bottle's thread-local response object
    resp = requests.get(url, timeout=30)

    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)

    result = {'url': url, 'status': 'ok'}

    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(lxmldoc, url)

    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(lxmldoc)

    return result
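
The module-level response object suggests this handler runs inside a Bottle app; a hypothetical wiring sketch (route path and function name are assumptions):

# Hypothetical Bottle route exposing async_extruct.
import bottle

@bottle.route('/extruct')
def extruct_endpoint():
    url = bottle.request.query.get('url', 'http://www.example.com/')
    return async_extruct(url, microdata=True, jsonld=True)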
Example #31
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    return {
        name: extractor.extract_items(tree, url=url)
        for name, extractor in (('json-ld', JsonLdExtractor()),
                                ('microdata', MicrodataExtractor()),
                                ('rdfa', RDFaExtractor()))
    }
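
Calling the helper above might look like this (a sketch; the HTML is illustrative, and extract_items returns lazy iterators that may need materializing):

# Sketch of calling the extract() helper above.
html = '<div itemscope itemtype="http://schema.org/Thing"></div>'
data = extract(html, url='http://www.example.com/')
for name, items in data.items():
    print(name, list(items))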
Example #32
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients."""
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        if len(data['items']) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            recipe = data['items'][2]['properties']
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = [
                ingredient for ingredient in recipe['ingredients']
                if ingredient not in ['', 'Add all ingredients to list']
            ]
            recipe_tags = recipe['recipeCategory']
            if 'recipeCuisine' in recipe.keys():
                recipe_tags.append(recipe['recipeCuisine'])
            recipe_output_item['tags'] = recipe_tags
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = recipe['ingredients']
            recipe_output_item['tags'] = [
                tag['properties']['title'] for tag in data['items'][1:]
            ]
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']

        yield recipe_output_item
Example #33
  def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
      jslde = JsonLdExtractor()
      data = jslde.extract(response.body)
      schema_type = "jsonld"

    if schema_type == "mde":
      recipe = data['items'][2]['properties']
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = [
          ingredient for ingredient in recipe['ingredients']
          if ingredient not in ['', 'Add all ingredients to list']
      ]
      recipe_tags = recipe['recipeCategory']
      if 'recipeCuisine' in recipe.keys():
        recipe_tags.append(recipe['recipeCuisine'])
      recipe_output_item['tags'] = recipe_tags
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
      recipe = data['items'][0]
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = recipe['ingredients']
      recipe_output_item['tags'] = [tag['properties']['title']
                                    for tag in data['items'][1:]]
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']

    yield recipe_output_item
Example #34
def async_extruct(url, microdata=True, jsonld=True):
    resp = requests.get(url, timeout=30)

    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)

    result = {"url": url, "status": "ok"}

    if microdata:
        mde = MicrodataExtractor(nested=True)
        mde_data = mde.extract_items(lxmldoc, url)  # avoid shadowing the flag
        if mde_data.get("items", []):
            result["microdata"] = mde_data

    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get("items", []):
            result["json-ld"] = jsonldata

    return result
Example #35
    def parse_listing(self, response):
        mde = MicrodataExtractor()
        data = mde.extract(response.body)['items']
        if data:
            it = {}
            it['shop'] = data[0]['properties']
            prod = data[1]['properties']
            it.update(prod['offerDetails']['properties'])
            it['name'] = prod['name']
            it['url'] = response.url
            it['properties'] = [x for x in response.css('#item-overview .properties li::text').extract()
                                if all(y not in x.lower() for y in ['materials', 'feedback', 'favorited', 'ships'])]
            it['materials'] = e0(response.css('#overview-materials::text'))
            it['origin'] = e0(response.css('.origin::text'))
            it['imgs'] = response.css('#image-carousel img::attr("src")').extract()
            it['description'] = e0(response.css("#description-text"))
            it['tags'] = response.css('#listing-tag-list li a::text').extract()
            it['fineprints'] = [x.strip() for x in response.css('#fineprint li::text').extract()[:4]]
            it['rating'] = response.css('.review-rating meta::attr("content")').extract()
            # it['html'] = response.body
            yield it
Example #36
def async_extruct(url, microdata=True, jsonld=True):
    response.content_type = 'application/json'  # assumes Bottle's thread-local response object
    resp = requests.get(url, timeout=30)

    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)

    result = {'url': url, 'status': 'ok'}

    if microdata:
        mde = MicrodataExtractor(nested=True)
        mde_data = mde.extract_items(lxmldoc, url)  # avoid shadowing the flag
        if mde_data.get('items', []):
            result['microdata'] = mde_data

    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get('items', []):
            result['json-ld'] = jsonldata

    return result
Example #37
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                extract_json_ld: bool = False,
                extract_rdfa: bool = False) \
            -> List[Extraction]:

        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self.wrap_data(
                "title",
                soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if soup.title and extract_meta:
            meta_content = self.wrap_meta_content(soup.find_all("meta"))
            meta_data = self.wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self.wrap_data("microdata", mde.extract(html_text))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
            res.append(rdfae_data)

        return res
Example #38
def get_review_items_from_microdata(spider,
                                    review_type,
                                    response,
                                    product,
                                    reviews_xpath=None,
                                    pros_xpath=None,
                                    cons_xpath=None):
    '''
    Get all reviews from a page, useful for user review pages with microdata
    :param spider: the spider we use to scrape the site
    :param review_type: type of the reviews to scrape, should be either USER or PRO
    :param response: an instance of Scrapy's Response object where reviews will be scraped from
    :param product: the product item the reviews are written for
    :param reviews_xpath: the xpath to extract review selectors from 'response'
    :param pros_xpath: the xpath to extract pros from review selectors
    :param cons_xpath: the xpath to extract cons from review selectors
    :return: list of all review items extracted
    '''
    mde = MicrodataExtractor()
    try:
        items = mde.extract(response.text)
    except XMLSyntaxError:
        return []  # Nothing to do here...

    all_review_extracts = [
        i for i in items if i['type'] == "http://schema.org/Review"
    ]
    all_pros = []
    all_cons = []

    if reviews_xpath:
        add_pros_and_cons = True
        all_reviews = response.xpath(reviews_xpath)
        for single_review in all_reviews:
            if pros_xpath:
                pros = spider.extract_all(single_review.xpath(pros_xpath),
                                          separator=' ; ')
            else:
                pros = ''
            if cons_xpath:
                cons = spider.extract_all(single_review.xpath(cons_xpath),
                                          separator=' ; ')
            else:
                cons = ''
            all_pros.append(pros)
            all_cons.append(cons)

        if len(all_pros) != len(all_review_extracts) or len(all_cons) != len(
                all_review_extracts):
            spider.logger.warning(
                "Number of reviews extracted from xpath is different from number of review microdata."
            )
            add_pros_and_cons = False
    else:
        add_pros_and_cons = False

    review_items = []
    for index, item in enumerate(all_review_extracts):
        if add_pros_and_cons:
            review = review_microdata_extruct(item,
                                              product=product,
                                              tp=review_type,
                                              pros=all_pros[index],
                                              cons=all_cons[index])
        else:
            review = review_microdata_extruct(item,
                                              product=product,
                                              tp=review_type)
        review_items.append(review)

    return review_items
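
A hedged call sketch for the helper above (spider, response, and product come from the crawl context; the xpaths are illustrative):

# Hypothetical call of get_review_items_from_microdata.
reviews = get_review_items_from_microdata(
    spider,
    review_type='USER',
    response=response,
    product=product,
    reviews_xpath='//div[@class="review"]',
    pros_xpath='.//p[@class="pros"]/text()',
    cons_xpath='.//p[@class="cons"]/text()')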
Example #39
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""

    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    #       test on body of crawlers!
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                self.rdfadata = []
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                self.mdedata = []
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self,
                                      microdata=None,
                                      jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
           supported metadata format. Note that we only try to extract the
           *first* block of NewsArticle data for each method (which is then
           combined with the first extracted from other methods if more than
           one is selected."""
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                #                logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
#        logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd
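
A hedged usage sketch inside a Scrapy callback (the spider context is assumed; names are illustrative):

# Hypothetical usage of RISJMetadataExtractor in a spider callback.
def parse(self, response):
    extractor = RISJMetadataExtractor(response, microdata=True, jsonld=True)
    article = extractor.extract_newsarticle_schemaorg()
    if article:
        yield article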
Example #40
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True, uniform the output format of all syntaxes to a list
                of dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* all the other properties as keys here */
                 }
       return_html_node: if True, it includes into the result a HTML node of
                         respective embedded metadata under 'htmlNode' key.
                         The feature is supported only by microdata syntax.
                         Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning,
                      stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'ignore':
            return {}
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append((
            'json-ld',
            JsonLdExtractor().extract_items,
            tree,
        ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append((
            'rdfa',
            RDFaExtractor().extract_items,
            tree,
        ))
    if 'dublincore' in syntaxes:
        processors.append((
            'dublincore',
            DublinCoreExtractor().extract_items,
            tree,
        ))
    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        uniform_processors = []
        if 'microdata' in syntaxes:
            uniform_processors.append((
                'microdata',
                _umicrodata_microformat,
                output['microdata'],
                schema_context,
            ))
        if 'microformat' in syntaxes:
            uniform_processors.append((
                'microformat',
                _umicrodata_microformat,
                output['microformat'],
                'http://microformats.org/wiki/',
            ))
        if 'opengraph' in syntaxes:
            uniform_processors.append((
                'opengraph',
                _uopengraph,
                output['opengraph'],
                None,
            ))
        if 'dublincore' in syntaxes:
            uniform_processors.append((
                'dublincore',
                _udublincore,
                output['dublincore'],
                None,
            ))

        for syntax, uniform, raw, schema_context in uniform_processors:
            try:
                if syntax == 'opengraph':
                    output[syntax] = uniform(raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform(raw)
                else:
                    output[syntax] = uniform(raw, schema_context)
            except Exception as e:
                if errors == 'ignore':
                    output[syntax] = []
                if errors == 'log':
                    output[syntax] = []
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))
                if errors == 'strict':
                    raise

    return output
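
Calling the function above with a subset of syntaxes and logged errors looks like this (a minimal sketch; the HTML string is illustrative):

# Minimal sketch of calling extract() as defined above.
html = ('<html><body>'
        '<div itemscope itemtype="http://schema.org/Product">'
        '<span itemprop="name">Widget</span>'
        '</div>'
        '</body></html>')

data = extract(html,
               base_url='http://www.example.com/',
               syntaxes=['microdata', 'json-ld'],
               errors='log',
               uniform=True)
print(data['microdata'])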
Example #41
	def parse(self, response):
		selector = Selector(response=response)

		extractor = MicrodataExtractor()
		items = extractor.extract(response.body_as_unicode(), response.url)
		print(items)
Example #42
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            schema_context='http://schema.org',
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True, uniform the output format of all syntaxes to a list
                of dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* all the other properties as keys here */
                 }
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    output = {}
    for label, extract, document in processors:
        try:
            output[label] = list(extract(document, base_url=base_url))
        except Exception:
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise

    if uniform:
        if 'microdata' in syntaxes:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in syntaxes:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in syntaxes:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
Example #43
import os.path
import re
import io
import shutil
from extruct.w3cmicrodata import MicrodataExtractor

import sys
from subprocess import *
from shlex import split

from pprint import pprint as pp

from zipfile import ZipFile, BadZipFile

from bs4 import BeautifulSoup

mde = MicrodataExtractor()


def microdata(html):
    microdata = mde.extract(html)
    microdata = microdata['items'][0]['properties']

    def attrget(item, key):
        # Walk a dotted key path, e.g. 'aggregateRating.properties.ratingValue'.
        keys = key.split('.')
        for key in keys:
            item = item.get(key, {})
        if item == {}:
            return None
        return item

    keys = ('url', 'name', 'version', 'aggregateRating.properties.ratingCount',
            'aggregateRating.properties.ratingValue', 'image',