Example #1
def extract_all_json_ld(html_text):
    try:
        jslde = JsonLdExtractor()
        data = jslde.extract(html_text)
        return data
    except Exception:  # extract() raises on malformed JSON-LD
        return []  # match extract()'s list return type on failure
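A minimal way to exercise this helper, as a hedged sketch: the extruct import path is the library's documented one, but the sample HTML string is invented for illustration.

from extruct.jsonld import JsonLdExtractor

html_text = '<script type="application/ld+json">{"@type": "Person"}</script>'
for obj in extract_all_json_ld(html_text):
    print(obj.get('@type'))  # -> Person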
Example #2
    def test_schemaorg_CreativeWork(self):
        body = get_testdata('schema.org', 'CreativeWork.001.html')
        expected = json.loads(get_testdata('schema.org', 'CreativeWork.001.jsonld').decode('UTF-8'))

        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
Example #3
    def test_schemaorg_CreativeWork(self):
        for i in [1]:
            body = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(i))
            expected = json.loads(get_testdata('schema.org', 'CreativeWork.{:03d}.jsonld'.format(i)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
Example #4
    def test_schemaorg_CreativeWork(self):
        for i in [1]:
            body = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(i))
            expected = json.loads(get_testdata('schema.org', 'CreativeWork.{:03d}.jsonld'.format(i)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertDictEqual(data, expected)
Example #5
    def test_songkick(self):
        page = "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015"
        body = get_testdata('songkick', '{}.html'.format(page))
        expected = json.loads(get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))

        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
Example #6
    def test_null(self):
        page = "null_ld_mock"
        body = get_testdata('misc', '{}.html'.format(page))
        expected = json.loads(
            get_testdata('misc', '{}.jsonld'.format(page)).decode('UTF-8'))

        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
Example #7
    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)

        # Fields to output for each tool
        my_keys = [
            '@id', '@type', 'applicationCategory', 'name', 'alternateName',
            'description', 'url', 'sameAs', 'image', 'genre',
            'softwareVersion', 'softwareRequirements', 'operatingSystem',
            'downloadUrl', 'installUrl'
        ]

        extractor = JsonLdExtractor()
        # body_as_unicode() is spelled response.text in newer Scrapy versions
        my_items = extractor.extract(response.body_as_unicode(), response.url)
        this_item = my_items['items'][0]['@graph'][0]
        my_item = {}

        # output all basic items
        for this_key in my_keys:
            my_item[this_key] = this_item[this_key]

        # get license
        my_item['license_type'] = this_item['license']['@type']
        my_item['license_text'] = self.removeNonAscii(this_item['license']['text'])

        # Get pmcrefcount of the first publication only
        Entrez.email = "*****@*****.**"
        first_pub = this_item['publication'][0]
        pmcrefcount = 0
        if 'pubmed' in first_pub['url']:
            # the last path segment of a PubMed URL is the PMID
            this_pmid = first_pub['url'].split('/')[-1]
            pmcrefcount = Entrez.read(
                Entrez.efetch(db="pubmed", id=this_pmid,
                              rettype="docsum"))[0]['PmcRefCount']
        my_item['primary_pub'] = first_pub['name']
        my_item['primary_pub_url'] = first_pub['url']
        my_item['primary_pub_pmcrefcount'] = pmcrefcount

        yield my_item
Example #8
    def test_jsonld_with_comments(self):
        for prefix in ['JoinAction.001', 'AllocateAction.001']:
            body = get_testdata('schema.org.invalid', '{}.html'.format(prefix))
            name = '{}.jsonld'.format(prefix)
            expected = json.loads(get_testdata('schema.org.invalid', name).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
        for prefix in ['JoinAction.001', 'AllocateAction.001']:
            body = get_testdata('custom.invalid', '{}.html'.format(prefix))
            expected = json.loads(get_testdata('custom.invalid', '{}.jsonld'.format(prefix)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
Example #9
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients."""
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        if len(data) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            recipe = data[0]['properties']
        elif schema_type == "jsonld":
            # extract() returns a plain list of JSON-LD objects in current
            # extruct, so index the list directly rather than an 'items' dict
            recipe = data[0]

        properties = [
            'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
            'recipeIngredient', 'aggregateRating', 'recipeYield',
            'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
            'recipeCategory', 'review', 'prepTime', 'description'
        ]
        recipe_output_item = RecipeItem()
        for prop in properties:
            try:
                recipe_output_item[prop] = recipe[prop]
            except KeyError:
                recipe_output_item[prop] = None

        yield recipe_output_item
Example #10
    def test_songkick(self):
        for page in [
                "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
                #"Maxïmo Park Gigography, Tour History & Past Concerts",
                #"Years & Years Tickets, Tour Dates 2015 & Concerts",
            ]:
            body = get_testdata('songkick', '{}.html'.format(page))
            expected = json.loads(get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertDictEqual(data, expected)
Example #11
    def test_songkick(self):
        for page in [
                "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
                #"Maxïmo Park Gigography, Tour History & Past Concerts",
                #"Years & Years Tickets, Tour Dates 2015 & Concerts",
            ]:
            body = get_testdata('songkick', '{}.html'.format(page))
            expected = json.loads(get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
Example #12
 def set_data(self):
     """ Extract JSON-LD data from self.page and store as a string in self.data. Return True if data is present, False if not. Raise RuntimeError if JSON-LD cannot be extracted (e.g. if it is malformed)."""
     jslde = JsonLdExtractor()
     try:
         self.data = dumps(jslde.extract(self.page, base_url=self.base_url))
         if self.data == "[]":
             return False
         else:
             return True
     except:
         msg = "Error extracting data from page"
         raise RuntimeError(msg)
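The method above assumes a host object exposing page, base_url and data. A minimal holder class, sketched here purely as an assumption (the original class is not shown in the snippet):

from json import dumps  # set_data() relies on dumps from the json module

class PageData:
    def __init__(self, page, base_url=None):
        self.page = page          # raw HTML string
        self.base_url = base_url  # forwarded to extract()
        self.data = None          # JSON string produced by set_data()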
Example #13
def extract_json_ld(html, typ_str):
    try:
        jslde = JsonLdExtractor()
        items = jslde.extract(html)
        for item in items:
            item_context = item.get('@context', '').rstrip(' /')
            if (item_context == 'http://schema.org' or item_context == 'https://schema.org') \
                    and item.get('@type', '') == typ_str:
                return item

        return None
    except Exception:  # treat malformed markup as "no match"
        return None
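A hypothetical call, assuming html_text holds a page source: pick out the first schema.org Product object, if any.

product = extract_json_ld(html_text, 'Product')
if product is not None:
    print(product.get('name'))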
Example #14
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                microdata_base_url: str = "",
                extract_json_ld: bool = False,
                extract_rdfa: bool = False,
                rdfa_base_url: str = "") \
            -> List[Extraction]:
        """
        Args:
            html_text (str): input html string to be extracted
            extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
            extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
            extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
            microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
            extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
            extract_rdfa (bool): True if rdfa needs to be extracted, returns as { "rdfa": [...] }
            rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

        Returns:
            List[Extraction]: the list of extractions, or the empty list if there are no matches.
        """
        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self._wrap_data("title", soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if extract_meta:  # meta tags do not require a <title> to be present
            meta_content = self._wrap_meta_content(soup.find_all("meta"))
            meta_data = self._wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
            res.append(rdfae_data)

        return res
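A usage sketch for the method above. The class name HtmlMetadataExtractor and the html_text variable are assumptions; the snippet shows only the method itself.

extractor = HtmlMetadataExtractor()
for extraction in extractor.extract(html_text,
                                    extract_title=True,
                                    extract_json_ld=True):
    print(extraction)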
Example #15
def loadSOGraphFromHtml(html, url):
    """
    Extract jsonld entries from provided HTML text

    Args:
        html (string): HTML text to be parsed
        url (string): used as the publicID (base URI) when loading the graph

    Returns:
        ConjunctiveGraph: Graph loaded from html

    """
    jslde = JsonLdExtractor()
    json_content = jslde.extract(html)
    g = ConjunctiveGraph()
    for json_data in json_content:
        g_data = loadSOGraph(data=json.dumps(json_data), publicID=url)
        g += g_data
    return g
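A short usage sketch, assuming html holds a page source; an rdflib graph supports len() and triple iteration, so the loaded result can be inspected directly.

g = loadSOGraphFromHtml(html, 'https://example.org/page')
print(len(g))  # number of triples loaded
for s, p, o in g:
    print(s, p, o)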
Example #16
    def parse_scene(self, response):
        jslde = JsonLdExtractor()
        # avoid shadowing the json module with the extraction result
        jsonld_items = jslde.extract(response.text)
        data = {}
        for obj in jsonld_items:
            if obj['@type'] == 'VideoObject':
                data = obj
                break

        item = SceneItem()
        item['title'] = self.cleanup_title(data['name'])
        item['description'] = self.cleanup_description(data['description'])
        item['image'] = data['thumbnail']
        item['image_blob'] = None
        item['id'] = self.get_id(response)
        item['trailer'] = data['contentUrl']
        item['url'] = response.url
        item['date'] = self.parse_date(data['datePublished']).isoformat()
        item['site'] = data['author']['name']
        item['network'] = self.network
        item['parent'] = item['site']

        item['performers'] = []
        for model in data['actor']:
            item['performers'].append(model['name'])

        item['tags'] = self.get_tags(response)
        days = int(self.days)
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
Example #17
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients."""
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        if len(data['items']) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            recipe = data['items'][2]['properties']
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = [
                ingredient for ingredient in recipe['ingredients']
                if ingredient not in ['', 'Add all ingredients to list']
            ]
            recipe_tags = recipe['recipeCategory']
            if 'recipeCuisine' in recipe.keys():
                recipe_tags.append(recipe['recipeCuisine'])
            recipe_output_item['tags'] = recipe_tags
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = recipe['ingredients']
            recipe_output_item['tags'] = [
                tag['properties']['title'] for tag in data['items'][1:]
            ]
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']

        yield recipe_output_item
Example #18
  def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
      jslde = JsonLdExtractor()
      data = jslde.extract(response.body)
      schema_type = "jsonld"

    if schema_type == "mde":
      recipe = data['items'][2]['properties']
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = [
          ingredient for ingredient in recipe['ingredients']
          if ingredient not in ['', 'Add all ingredients to list']
      ]
      recipe_tags = recipe['recipeCategory']
      if 'recipeCuisine' in recipe.keys():
        recipe_tags.append(recipe['recipeCuisine'])
      recipe_output_item['tags'] = recipe_tags
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
      recipe = data['items'][0]
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = recipe['ingredients']
      recipe_output_item['tags'] = [tag['properties']['title']
                                    for tag in data['items'][1:]]
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']

    yield recipe_output_item
Example #19
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                extract_json_ld: bool = False,
                extract_rdfa: bool = False) \
            -> List[Extraction]:

        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self.wrap_data(
                "title",
                soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if extract_meta:  # meta tags do not require a <title> to be present
            meta_content = self.wrap_meta_content(soup.find_all("meta"))
            meta_data = self.wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self.wrap_data("microdata", mde.extract(html_text))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
            res.append(rdfae_data)

        return res
Example #20
    def parse_job(self, response):
        # Create the loader using the response
        # E.G. : l.add_xpath('item', '*xpath*', re='*expression*')
        jslde = JsonLdExtractor()
        data = jslde.extract(response.text)
        data = data[0]

        l = ItemLoader(item=JobsjsonItem(), response=response)
        l.add_value('search_postcode', self.search_postcode)
        l.add_value('search_radius', self.search_radius)
        l.add_value('date_scraped', time.strftime("%Y-%m-%d %H:%M:%S"))
        l.add_value('date_posted', data['datePosted'])
        l.add_value('valid_until', data['validThrough'])
        l.add_value('job_id', response.url, re=r'\d{7}')
        l.add_value('job_title', data['title'])
        l.add_value('job_type', data['employmentType'])
        address = data['jobLocation']['address']
        l.add_value('location', ",".join([
            address['addressLocality'], address['addressRegion'],
            address['postalCode'], address['addressCountry']]))
        l.add_xpath('contact_name', '//table[@class="job-listing-table"]//tr[8]//td//text()')
        l.add_xpath('start_date', '//table[@class="job-listing-table"]//tr[6]//td//text()')
        try:
            l.add_value('salary_min', data['baseSalary']['value']['value'])
        except (KeyError, TypeError):
            l.add_value('salary_min', 'NA')
        l.add_value('listed_on', data['datePosted'])
        l.add_value('recruiter', data['hiringOrganization']['name'])
        l.add_value('recruiter_url', data['hiringOrganization']['sameAs'])
        try:
            l.add_value('job_reference', data['identifier']['value'])
        except (KeyError, TypeError):
            l.add_value('job_reference', 'NA')
        l.add_value('url', response.url)
        l.add_value('job_description', data['description'])
        l.add_value('job_skills', data['skills'])
        l.add_value('addressLocality', data['jobLocation']['address']['addressLocality'])
        l.add_value('addressRegion', data['jobLocation']['address']['addressRegion'])
        l.add_value('postalCode', data['jobLocation']['address']['postalCode'])
        l.add_value('addressCountry', data['jobLocation']['address']['addressCountry'])
        return l.load_item()
Example #21
 def _extract_json_data(self, blob):
     html = blob.download_as_string().decode()
     jslde = JsonLdExtractor()
     data = jslde.extract(html)
     return json.dumps(strip_strings(data))
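strip_strings is not shown in the snippet. A plausible implementation, offered only as an assumption, would recursively trim whitespace from every string in the extracted JSON-LD structure:

def strip_strings(value):
    # Hypothetical helper: recursively .strip() all strings in a
    # JSON-like structure of dicts, lists and scalars.
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, list):
        return [strip_strings(v) for v in value]
    if isinstance(value, dict):
        return {k: strip_strings(v) for k, v in value.items()}
    return value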
Example #22
 def _check_jsonld(self, body, expected):
     jsonlde = JsonLdExtractor()
     data = jsonlde.extract(body)
     self.assertEqual(data, expected)
Example #23
def getJsonLdFromHTML(html_text):
    """
    Returns an array of json_ld structures found in the provided html_text
    """
    jslde = JsonLdExtractor()
    return jslde.extract(html_text)
Example #24
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""

    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    #       test on body of crawlers!
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                self.rdfadata = []  # keep the attribute defined on failure
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                self.mdedata = []  # keep the attribute defined on failure
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self,
                                      microdata=None,
                                      jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
           supported metadata format. Note that we only try to extract the
           *first* block of NewsArticle data for each method (which is then
           combined with the first extracted from other methods if more than
           one is selected."""
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                # logger.debug('Analysing JSON-LD data: ' + pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
        # logger.debug('Returning schema.org NewsArticle: ' + pformat(outd))
        return outd
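A usage sketch inside a Scrapy spider callback, assuming the usual response object; extract_newsarticle_schemaorg() returns a plain dict, so schema.org fields such as headline can be read with .get().

def parse(self, response):
    extractor = RISJMetadataExtractor(response, jsonld=True)
    article = extractor.extract_newsarticle_schemaorg()
    self.logger.info('headline: %s', article.get('headline'))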