def extract_all_json_ld(html_text):
    """Extract every JSON-LD block from an HTML document.

    Args:
        html_text: raw HTML string to scan for JSON-LD script blocks.

    Returns:
        The extractor's result on success, or an empty dict on any
        extraction failure (kept for backward compatibility with callers
        that expect a falsy mapping on error).
    """
    try:
        jslde = JsonLdExtractor()
        data = jslde.extract(html_text)
        return data
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; extraction failures still degrade to {}.
        return {}
def test_schemaorg_CreativeWork(self):
    """JSON-LD extracted from the CreativeWork fixture matches its .jsonld file."""
    html_body = get_testdata('schema.org', 'CreativeWork.001.html')
    raw_expected = get_testdata('schema.org', 'CreativeWork.001.jsonld')
    expected = json.loads(raw_expected.decode('UTF-8'))
    extractor = JsonLdExtractor()
    self.assertEqual(extractor.extract(html_body), expected)
def test_schemaorg_CreativeWork(self):
    """Each numbered CreativeWork fixture extracts to the JSON stored alongside it."""
    for index in [1]:
        stem = 'CreativeWork.{:03d}'.format(index)
        html_body = get_testdata('schema.org', stem + '.html')
        expected = json.loads(
            get_testdata('schema.org', stem + '.jsonld').decode('UTF-8'))
        extractor = JsonLdExtractor()
        self.assertEqual(extractor.extract(html_body), expected)
def test_schemaorg_CreativeWork(self):
    """Dict-level comparison of extracted JSON-LD against the stored fixture."""
    for index in [1]:
        name = 'CreativeWork.{:03d}'.format(index)
        html_body = get_testdata('schema.org', '{}.html'.format(name))
        expected_raw = get_testdata('schema.org', '{}.jsonld'.format(name))
        extractor = JsonLdExtractor()
        self.assertDictEqual(
            extractor.extract(html_body),
            json.loads(expected_raw.decode('UTF-8')))
def test_songkick(self):
    """The songkick event page extracts to its recorded JSON-LD snapshot."""
    page = "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015"
    html_body = get_testdata('songkick', '{}.html'.format(page))
    expected = json.loads(
        get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))
    self.assertEqual(JsonLdExtractor().extract(html_body), expected)
def test_null(self):
    """The null_ld_mock fixture extracts to its stored JSON-LD snapshot."""
    page = "null_ld_mock"
    html_body = get_testdata('misc', '{}.html'.format(page))
    expected = json.loads(
        get_testdata('misc', '{}.jsonld'.format(page)).decode('UTF-8'))
    self.assertEqual(JsonLdExtractor().extract(html_body), expected)
def parse_item(self, response):
    """Parse a tool page: pull schema.org JSON-LD fields, license info,
    and the PubMed Central citation count of the first publication.

    Yields a single dict of scraped fields for the tool on the page.
    """
    self.logger.info('Hi, this is an item page! %s', response.url)
    # Fields copied verbatim from the JSON-LD @graph node for each tool.
    my_keys = [
        '@id', '@type', 'applicationCategory', 'name', 'alternateName',
        'description', 'url', 'sameAs', 'image', 'genre',
        'softwareVersion', 'softwareRequirements', 'operatingSystem',
        'downloadUrl', 'installUrl'
    ]
    extractor = JsonLdExtractor()
    my_items = extractor.extract(response.body_as_unicode(), response.url)
    # First node of the first item's @graph is assumed to describe the tool.
    this_item = my_items['items'][0]['@graph'][0]
    my_item = {}
    # Output all basic fields.
    for this_key in my_keys:
        my_item[this_key] = this_item[this_key]
    # License block is nested; its text may contain non-ASCII noise.
    license_type = this_item['license']['@type']
    license_text = self.removeNonAscii(this_item['license']['text'])
    my_item['license_type'] = license_type
    my_item['license_text'] = license_text
    # Get pmcrefcount of the first publication only.
    Entrez.email = "*****@*****.**"
    first_pub = this_item['publication'][0]
    pmcrefcount = 0
    if 'pubmed' in first_pub['url']:
        # NOTE(review): `[-1:]` yields a one-element list, not a string;
        # Entrez accepts an id list, but `[-1]` may have been intended.
        this_pmid = first_pub['url'].split('/')[-1:]
        pmcrefcount = Entrez.read(
            Entrez.efetch(db="pubmed", id=this_pmid,
                          rettype="docsum"))[0]['PmcRefCount']
    my_item['primary_pub'] = first_pub['name']
    my_item['primary_pub_url'] = first_pub['url']
    my_item['primary_pub_pmcrefcount'] = pmcrefcount
    # Removed the trailing `return my_item` that followed the yield:
    # scrapy ignores generator return values, so it was dead weight.
    yield my_item
def test_jsonld_with_comments(self):
    """JSON-LD blocks containing JS-style comments still parse to the fixtures."""
    cases = [
        ('schema.org.invalid', 'JoinAction.001'),
        ('schema.org.invalid', 'AllocateAction.001'),
        ('custom.invalid', 'JoinAction.001'),
        ('custom.invalid', 'AllocateAction.001'),
    ]
    for folder, prefix in cases:
        html_body = get_testdata(folder, '{}.html'.format(prefix))
        expected = json.loads(
            get_testdata(folder, '{}.jsonld'.format(prefix)).decode('UTF-8'))
        self.assertEqual(JsonLdExtractor().extract(html_body), expected)
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Tries microdata first and falls back to JSON-LD when the page exposes
    no microdata. Yields a RecipeItem with absent properties set to None.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data) == 0:
        # No microdata on the page: fall back to JSON-LD.
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        recipe = data[0]['properties']
    elif schema_type == "jsonld":
        # NOTE(review): this indexes the old dict-shaped extruct result
        # ({'items': [...]}) while the mde branch above uses the list API —
        # confirm which extruct version this targets.
        recipe = data['items'][0]
    # Copy each schema.org Recipe property, defaulting absent ones to None.
    # (Removed several large blocks of commented-out dead code that built
    # the item field-by-field; this loop supersedes them.)
    properties = [
        'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
        'recipeIngredient', 'aggregateRating', 'recipeYield',
        'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
        'recipeCategory', 'review', 'prepTime', 'description'
    ]
    recipe_output_item = RecipeItem()
    for prop in properties:
        try:
            recipe_output_item[prop] = recipe[prop]
        except KeyError:
            recipe_output_item[prop] = None
    yield recipe_output_item
def test_songkick(self):
    """Songkick pages extract to their stored JSON-LD snapshots (dict compare)."""
    pages = [
        "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
        # "Maxïmo Park Gigography, Tour History & Past Concerts",
        # "Years & Years Tickets, Tour Dates 2015 & Concerts",
    ]
    for page in pages:
        html_body = get_testdata('songkick', '{}.html'.format(page))
        expected = json.loads(
            get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))
        self.assertDictEqual(JsonLdExtractor().extract(html_body), expected)
def test_songkick(self):
    """Songkick pages extract to their stored JSON-LD snapshots."""
    pages = [
        "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
        # "Maxïmo Park Gigography, Tour History & Past Concerts",
        # "Years & Years Tickets, Tour Dates 2015 & Concerts",
    ]
    for page in pages:
        html_body = get_testdata('songkick', '{}.html'.format(page))
        raw = get_testdata('songkick', '{}.jsonld'.format(page))
        self.assertEqual(JsonLdExtractor().extract(html_body),
                         json.loads(raw.decode('UTF-8')))
def set_data(self):
    """Extract JSON-LD data from self.page and store it as a JSON string
    in self.data.

    Returns:
        True if JSON-LD data is present, False if the page yields none.

    Raises:
        RuntimeError: if JSON-LD cannot be extracted (e.g. if it is
            malformed), chained to the underlying extraction error.
    """
    jslde = JsonLdExtractor()
    try:
        self.data = dumps(jslde.extract(self.page, base_url=self.base_url))
    except Exception as err:
        # Narrowed from a bare `except:` and chained so the original
        # failure is preserved in the traceback.
        msg = "Error extracting data from page"
        raise RuntimeError(msg) from err
    # An empty JSON array means the page carried no JSON-LD blocks.
    return self.data != "[]"
def extract_json_ld(html, typ_str):
    """Return the first schema.org JSON-LD item of type `typ_str` in `html`.

    Args:
        html: HTML document to scan.
        typ_str: value of '@type' to match (e.g. 'Recipe').

    Returns:
        The matching JSON-LD item dict, or None when nothing matches or
        extraction fails.
    """
    try:
        jslde = JsonLdExtractor()
        items = jslde.extract(html)
        for item in items:
            # Normalize trailing slash/space so both http(s)://schema.org
            # and http(s)://schema.org/ contexts match.
            item_context = item.get('@context', '').rstrip(' /')
            if (item_context == 'http://schema.org'
                    or item_context == 'https://schema.org') \
                    and item.get('@type', '') == typ_str:
                return item
        return None
    except Exception:
        # Narrowed from a bare `except:`; extraction failures return None.
        return None
def extract(self, html_text: str, extract_title: bool = False, extract_meta: bool = False, extract_microdata: bool = False, microdata_base_url: str = "", extract_json_ld: bool = False, extract_rdfa: bool = False, rdfa_base_url: str = "") \
        -> List[Extraction]:
    """
    Run the selected structured-data extractors over `html_text` and wrap
    each result with its source label.

    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
        extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
        extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
        microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
        extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
        extract_rdfa (bool): True if rdfs needs to be extracted, returns as { "rdfa": [...] }
        rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

    Returns:
        List[Extraction]: the list of extraction or the empty list if there are no matches.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    if soup.title and extract_title:
        title = self._wrap_data("title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    # NOTE(review): meta extraction is skipped when the page has no <title>;
    # confirm this gating is intended and not a copy-paste of the guard above.
    if soup.title and extract_meta:
        meta_content = self._wrap_meta_content(soup.find_all("meta"))
        meta_data = self._wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
        res.append(rdfae_data)
    return res
def loadSOGraphFromHtml(html, url):
    """
    Extract jsonld entries from provided HTML text and merge them into one graph.

    Args:
        html(string): HTML text to be parsed
        url(string): public ID assigned to each loaded graph

    Returns:
        ConjunctiveGraph: Graph loaded from html
    """
    extracted_blocks = JsonLdExtractor().extract(html)
    merged = ConjunctiveGraph()
    for block in extracted_blocks:
        merged += loadSOGraph(data=json.dumps(block), publicID=url)
    return merged
def parse_scene(self, response):
    """Build a SceneItem from the page's VideoObject JSON-LD and yield it
    if it passes the configured date filter."""
    jslde = JsonLdExtractor()
    # Renamed from `json`: the original shadowed the json module name.
    json_ld_items = jslde.extract(response.text)
    data = {}
    for obj in json_ld_items:
        if obj['@type'] == 'VideoObject':
            data = obj
            break
    item = SceneItem()
    item['title'] = self.cleanup_title(data['name'])
    item['description'] = self.cleanup_description(data['description'])
    item['image'] = data['thumbnail']
    item['image_blob'] = None
    item['id'] = self.get_id(response)
    item['trailer'] = data['contentUrl']
    item['url'] = response.url
    item['date'] = self.parse_date(data['datePublished']).isoformat()
    item['site'] = data['author']['name']
    item['network'] = self.network
    item['parent'] = item['site']
    item['performers'] = []
    for model in data['actor']:
        item['performers'].append(model['name'])
    item['tags'] = self.get_tags(response)
    # Scenes older than `days` are filtered out; a very large value
    # disables filtering by comparing against "0000-00-00".
    days = int(self.days)
    if days > 27375:
        filterdate = "0000-00-00"
    else:
        filterdate = date.today() - timedelta(days)
        filterdate = filterdate.strftime('%Y-%m-%d')
    if self.debug:
        if not item['date'] > filterdate:
            item['filtered'] = "Scene filtered due to date restraint"
        print(item)
    else:
        if filterdate:
            if item['date'] > filterdate:
                yield item
        else:
            yield item
def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    schema_type = "mde"
    extractor = MicrodataExtractor()
    data = extractor.extract(response.body)
    if not data['items']:
        # No microdata on the page: retry with JSON-LD.
        data = JsonLdExtractor().extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        recipe = data['items'][2]['properties']
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        unwanted = ['', 'Add all ingredients to list']
        recipe_output_item['ingredients'] = [
            entry for entry in recipe['ingredients'] if entry not in unwanted
        ]
        recipe_tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe.keys():
            recipe_tags.append(recipe['recipeCuisine'])
        recipe_output_item['tags'] = recipe_tags
        recipe_output_item['description'] = recipe.get('description')
        recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        recipe_output_item['ingredients'] = recipe['ingredients']
        recipe_output_item['tags'] = [
            tag['properties']['title'] for tag in data['items'][1:]
        ]
        recipe_output_item['description'] = recipe.get('description')
        recipe_output_item['url'] = recipe['url']
    yield recipe_output_item
def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    micro_extractor = MicrodataExtractor()
    data = micro_extractor.extract(response.body)
    schema_type = "mde" if len(data['items']) > 0 else "jsonld"
    if schema_type == "jsonld":
        # Page exposes no microdata: use JSON-LD instead.
        data = JsonLdExtractor().extract(response.body)
    item = RecipeItem()
    if schema_type == "mde":
        recipe = data['items'][2]['properties']
        item['recipe_name'] = recipe['name']
        item['ingredients'] = [
            ing for ing in recipe['ingredients']
            if ing not in ('', 'Add all ingredients to list')
        ]
        tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe.keys():
            tags.append(recipe['recipeCuisine'])
        item['tags'] = tags
    else:
        recipe = data['items'][0]
        item['recipe_name'] = recipe['name']
        item['ingredients'] = recipe['ingredients']
        item['tags'] = [t['properties']['title'] for t in data['items'][1:]]
    try:
        item['description'] = recipe['description']
    except KeyError:
        item['description'] = None
    item['url'] = recipe['url']
    yield item
def extract(self, html_text: str, extract_title: bool = False, extract_meta: bool = False, extract_microdata: bool = False, extract_json_ld: bool = False, extract_rdfa: bool = False) \
        -> List[Extraction]:
    """Run the selected structured-data extractors over `html_text`.

    Args:
        html_text (str): input HTML string to be processed.
        extract_title (bool): include the <title> string as {"title": "..."}.
        extract_meta (bool): include <meta> tag contents as {"meta": {...}}.
        extract_microdata (bool): include microdata as {"microdata": [...]}.
        extract_json_ld (bool): include JSON-LD as {"json-ld": [...]}.
        extract_rdfa (bool): include RDFa as {"rdfa": [...]}.

    Returns:
        List[Extraction]: one wrapped extraction per enabled source, in the
        order title, meta, microdata, json-ld, rdfa; empty if none matched.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    if soup.title and extract_title:
        title = self.wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    # NOTE(review): meta extraction is skipped when the page has no <title>;
    # confirm this gating is intended and not a copy-paste of the guard above.
    if soup.title and extract_meta:
        meta_content = self.wrap_meta_content(soup.find_all("meta"))
        meta_data = self.wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self.wrap_data("microdata", mde.extract(html_text))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
        res.append(rdfae_data)
    return res
def parse_job(self, response):
    """Populate a JobsjsonItem from the posting's JSON-LD plus a few
    xpath-scraped fields, and return the loaded item."""
    jslde = JsonLdExtractor()
    data = jslde.extract(response.text)
    data = data[0]
    l = ItemLoader(item=JobsjsonItem(), response=response)
    l.add_value('search_postcode', self.search_postcode)
    l.add_value('search_radius', self.search_radius)
    l.add_value('date_scraped', time.strftime("%Y-%m-%d %H:%M:%S"))
    l.add_value('date_posted', data['datePosted'])
    l.add_value('valid_until', data['validThrough'])
    # Job id is the 7-digit run in the URL; raw string avoids the invalid
    # escape-sequence warning the old '\d{7}' literal produced.
    l.add_value('job_id', response.url, re=r'\d{7}')
    l.add_value('job_title', data['title'])
    l.add_value('job_type', data['employmentType'])
    l.add_value('location', ",".join([
        data['jobLocation']['address']['addressLocality'],
        data['jobLocation']['address']['addressRegion'],
        data['jobLocation']['address']['postalCode'],
        data['jobLocation']['address']['addressCountry']]))
    l.add_xpath('contact_name', '//table[@class="job-listing-table"]//tr[8]//td//text()')
    l.add_xpath('start_date', '//table[@class="job-listing-table"]//tr[6]//td//text()')
    # Salary and reference are optional in the feed; narrowed from bare
    # `except:` to the errors a missing/odd-shaped nested key can raise.
    try:
        l.add_value('salary_min', data['baseSalary']['value']['value'])
    except (KeyError, TypeError):
        l.add_value('salary_min', 'NA')
    l.add_value('listed_on', data['datePosted'])
    l.add_value('recruiter', data['hiringOrganization']['name'])
    l.add_value('recruiter_url', data['hiringOrganization']['sameAs'])
    try:
        l.add_value('job_reference', data['identifier']['value'])
    except (KeyError, TypeError):
        l.add_value('job_reference', 'NA')
    l.add_value('url', response.url)
    l.add_value('job_description', data['description'])
    l.add_value('job_skills', data['skills'])
    l.add_value('addressLocality', data['jobLocation']['address']['addressLocality'])
    l.add_value('addressRegion', data['jobLocation']['address']['addressRegion'])
    l.add_value('postalCode', data['jobLocation']['address']['postalCode'])
    l.add_value('addressCountry', data['jobLocation']['address']['addressCountry'])
    return l.load_item()
def _extract_json_data(self, blob):
    """Download the blob, extract its JSON-LD, and return the result
    serialized as a JSON string with string values stripped."""
    page_html = blob.download_as_string().decode()
    extracted = JsonLdExtractor().extract(page_html)
    return json.dumps(strip_strings(extracted))
def _check_jsonld(self, body, expected):
    """Assert that JSON-LD extracted from `body` equals `expected`."""
    extractor = JsonLdExtractor()
    self.assertEqual(extractor.extract(body), expected)
def getJsonLdFromHTML(html_text):
    """
    Returns an array of json_ld structures found in the provided html_text
    """
    extractor = JsonLdExtractor()
    return extractor.extract(html_text)
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""
    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    # test on body of crawlers!

    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa
        # Always initialise the data attributes: previously a decode error
        # during extraction (or a disabled flag) left them unset, and
        # extract_newsarticle_schemaorg() then raised AttributeError.
        self.rdfadata = []
        self.mdedata = []
        self.jldata = []
        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
        supported metadata format. Note that we only try to extract the
        *first* block of NewsArticle data for each method (which is then
        combined with the first extracted from other methods if more than
        one is selected."""
        # Per-call overrides default to the flags chosen at construction.
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa
        outd = {}
        if jsonld:
            for d in self.jldata:
                # logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    # Skip blocks without @context/@type or of the wrong shape.
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
        # logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd