def test_wikipedia_xhtml_rdfa_no_prefix(self):
    """RDFa extraction from a page whose markup declares no explicit prefixes."""
    html = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
    expected_jsonld = json.loads(
        get_testdata('misc', 'Portfolio_Niels_Lubberman.json').decode('UTF-8')
    )
    extractor = RDFaExtractor()
    extracted = extractor.extract(html, base_url='http://nielslubberman.nl/drupal/')
    self.assertJsonLDEqual(extracted, expected_jsonld)
def test_expanded_opengraph_support(self):
    """Extraction of expanded Open Graph vocabulary via the RDFa extractor."""
    html = get_testdata('misc', 'expanded_OG_support_test.html')
    expected_jsonld = json.loads(
        get_testdata('misc', 'expanded_OG_support_test.json').decode('UTF-8')
    )
    extractor = RDFaExtractor()
    extracted = extractor.extract(html, base_url='http://www.example.com/index.html')
    self.assertJsonLDEqual(extracted, expected_jsonld)
def test_wikipedia_xhtml_rdfa(self):
    """RDFa extraction from the Wikipedia XHTML+RDFa sample page."""
    stem = 'xhtml+rdfa'
    page = get_testdata('wikipedia', stem + '.html').decode('UTF-8')
    expected_jsonld = json.loads(
        get_testdata('wikipedia', stem + '.expanded.json').decode('UTF-8'))
    # NOTE(review): 'exaple.com' looks like a typo for 'example.com', but the
    # expected fixture may have been generated against this exact base URL,
    # so the string is left unchanged.
    extracted = RDFaExtractor().extract(page, url='http://www.exaple.com/index.html')
    self.assertJsonLDEqual(extracted, expected_jsonld)
def test_wikipedia_xhtml_rdfa(self):
    """RDFa extraction from the Wikipedia xhtml+rdfa test fixture."""
    fixture = 'xhtml+rdfa'
    markup = get_testdata('wikipedia', fixture + '.html').decode('UTF-8')
    want = json.loads(
        get_testdata('wikipedia', fixture + '.expanded.json').decode('UTF-8'))
    # NOTE(review): 'exaple.com' is kept verbatim — the expected JSON may
    # depend on this exact base URL; confirm before "fixing" the typo.
    got = RDFaExtractor().extract(markup, url='http://www.exaple.com/index.html')
    self.assertJsonLDEqual(got, want)
def test_w3c_rdf11primer(self):
    """RDFa extraction for selected RDF 1.1 Primer examples from the W3C."""
    for example_no in [14]:
        stem = 'w3c.rdf11primer.example{:03d}'.format(example_no)
        markup = get_testdata('w3crdfa', stem + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        # NOTE(review): 'exaple.com' kept verbatim; the fixture may depend on
        # this exact base URL.
        got = RDFaExtractor().extract(markup, url='http://www.exaple.com/index.html')
        self.assertJsonLDEqual(got, want)
def test_w3c_rdf11primer(self):
    """RDFa extraction for selected RDF 1.1 Primer examples from the W3C."""
    for example_no in [14]:
        stem = 'w3c.rdf11primer.example{:03d}'.format(example_no)
        # Raw bytes are handed straight to the extractor here (no decode).
        markup = get_testdata('w3crdfa', stem + '.html')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        got = RDFaExtractor().extract(markup, url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)
def test_w3c_rdfaprimer(self):
    """RDFa extraction for selected RDFa Primer examples from the W3C.

    Also checks that a failure inside ``_fix_order`` (issue 116 fix) does
    not change the extractor's output.
    """
    for example_no in [5, 6, 7, 8, 9, 10, 11, 15]:
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
        print(stem)
        markup = get_testdata('w3crdfa', stem + '.html')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        extractor = RDFaExtractor()
        got = extractor.extract(markup, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)

        # This is for testing that the fix to issue 116 does not affect
        # severely rdfa output even in a presence of a bug in the code
        def broken_fix_order(x, y, z):
            raise Exception()

        extractor._fix_order = broken_fix_order
        got = extractor.extract(markup, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False, extract_microdata: bool = False,
            microdata_base_url: str = "", extract_json_ld: bool = False,
            extract_rdfa: bool = False, rdfa_base_url: str = "") \
        -> List[Extraction]:
    """Run the selected metadata extractors over an HTML document.

    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): if True, include {"title": "..."} from the
            <title> tag
        extract_meta (bool): if True, include {"meta": {...}} built from
            the <meta> tags
        extract_microdata (bool): if True, include {"microdata": [...]}
        microdata_base_url (str): base namespace url for microdata
            ("" if unspecified)
        extract_json_ld (bool): if True, include {"json-ld": [...]}
        extract_rdfa (bool): if True, include {"rdfa": [...]}
        rdfa_base_url (str): base namespace url for rdfa ("" if unspecified)

    Returns:
        List[Extraction]: one entry per requested extraction, or the empty
        list if nothing matched.
    """
    results = list()
    soup = BeautifulSoup(html_text, 'html.parser')

    # Title/meta extraction both require a parsed <title> to be present
    # (original operand order of the conditions is preserved).
    if soup.title and extract_title:
        results.append(self._wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8')))

    if soup.title and extract_meta:
        results.append(self._wrap_data(
            "meta", self._wrap_meta_content(soup.find_all("meta"))))

    if extract_microdata:
        results.append(self._wrap_data(
            "microdata",
            MicrodataExtractor().extract(html_text, microdata_base_url)))

    if extract_json_ld:
        results.append(self._wrap_data(
            "json-ld", JsonLdExtractor().extract(html_text)))

    if extract_rdfa:
        results.append(self._wrap_data(
            "rdfa", RDFaExtractor().extract(html_text, rdfa_base_url)))

    return results
def test_w3c_rdfaprimer(self):
    """RDFa extraction for selected RDFa Primer examples, with debug output."""
    for example_no in [5, 6, 7, 8, 9, 10, 11, 15]:
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
        print(stem)
        markup = get_testdata('w3crdfa', stem + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        got = RDFaExtractor().extract(markup, url='http://www.example.com/index.html')
        # Dump both representations to ease diffing on failure.
        print("extracted:\n%s" % pformat(tupleize(got)))
        print("expected:\n%s" % pformat(tupleize(want)))
        print("extracted:\n%s" % self.prettify(got))
        print("expected:\n%s" % self.prettify(want))
        self.assertJsonLDEqual(got, want)
def test_w3c_rdfaprimer(self):
    """RDFa extraction for selected RDFa Primer examples, with debug output."""
    for example_no in [5, 6, 7, 8, 9, 10, 11, 15]:
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
        print(stem)
        # Raw bytes are handed straight to the extractor here (no decode).
        markup = get_testdata('w3crdfa', stem + '.html')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        got = RDFaExtractor().extract(markup, url='http://www.example.com/index.html')
        # Dump both representations to ease diffing on failure.
        print("extracted:\n%s" % pformat(tupleize(got)))
        print("expected:\n%s" % pformat(tupleize(want)))
        print("extracted:\n%s" % self.prettify(got))
        print("expected:\n%s" % self.prettify(want))
        self.assertJsonLDEqual(got, want)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False, extract_microdata: bool = False,
            extract_json_ld: bool = False, extract_rdfa: bool = False) \
        -> List[Extraction]:
    """Run the selected metadata extractors over *html_text*.

    Each enabled flag contributes one wrapped entry ("title", "meta",
    "microdata", "json-ld" or "rdfa") to the returned list; the list is
    empty when nothing is enabled or nothing matches.
    """
    results = list()
    soup = BeautifulSoup(html_text, 'html.parser')

    # Title/meta extraction both require a parsed <title> to be present
    # (original operand order of the conditions is preserved).
    if soup.title and extract_title:
        results.append(self.wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8')))

    if soup.title and extract_meta:
        results.append(self.wrap_data(
            "meta", self.wrap_meta_content(soup.find_all("meta"))))

    if extract_microdata:
        results.append(self.wrap_data(
            "microdata", MicrodataExtractor().extract(html_text)))

    if extract_json_ld:
        results.append(self.wrap_data(
            "json-ld", JsonLdExtractor().extract(html_text)))

    if extract_rdfa:
        results.append(self.wrap_data(
            "rdfa", RDFaExtractor().extract(html_text)))

    return results
def extract_all_rdfa(response):
    """Return all RDFa data found in *response*, resolved against its URL."""
    return RDFaExtractor().extract(response.text, url=response.url)
class RISJMetadataExtractor(object): """An extruct-based metadata extractor""" # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then # test on body of crawlers! def __init__(self, response, microdata=False, jsonld=False, rdfa=False): self.response = response self.microdata = microdata self.jsonld = jsonld self.rdfa = rdfa if rdfa: try: self.rdfae = RDFaExtractor() self.rdfadata = self.rdfae.extract(self.response.text, url=self.response.url) except JSONDecodeError: pass if microdata: try: self.mde = MicrodataExtractor() self.mdedata = self.mde.extract(self.response.text) except JSONDecodeError: pass if jsonld: try: self.jlde = JsonLdExtractor() self.jldata = self.jlde.extract(self.response.text) except (JSONDecodeError, TypeError): self.jldata = [] finally: # Sometimes we get this in the meta dict from RISJExtractJSONLD self.jldata.extend(self.response.meta.get('json-ld', [])) def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None, rdfa=None): """Extract schema.org NewsArticle metadata, encoded using any supported metadata format. Note that we only try to extract the *first* block of NewsArticle data for each method (which is then combined with the first extracted from other methods if more than one is selected.""" if microdata is None: microdata = self.microdata if jsonld is None: jsonld = self.jsonld if rdfa is None: rdfa = self.rdfa outd = {} if jsonld: for d in self.jldata: # logger.debug('Analysing JSON-LD data: '+pformat(d)) try: if (re.match(r'https?://schema.org/?', d['@context']) and d['@type'] == 'NewsArticle'): outd.update(d) except (KeyError, TypeError): continue if microdata: for d in self.mdedata: logger.debug('Analysing W3C microdata: ' + pformat(d)) if re.match(r'https?://schema.org/NewsArticle/?', d.get('type', '')): outd.update(d) if rdfa: raise NotImplementedError # logger.debug('Returning schema.org NewsArticle: '+pformat(outd)) return outd
def get_rdfa_from_warc(warc_file_no, path):
    # Scan one WARC file line by line, extract RDFa from each embedded HTML
    # document, and write each result as text plus (via ConvertToXML) as
    # RDF/XML.
    #
    # warc_file_no: index used to name the per-WARC output directories.
    # path: filesystem path of the WARC file to read.
    # Returns the global iteration_count (total lines processed so far).
    global iteration_count
    global report_at_every_nth_step
    rdfaFileID = 1      # sequential output-file counter within this WARC
    htmlURL = ''        # URL of the HTML document currently being collected
    data = ''           # accumulated HTML of the current document
    append = False      # True while inside an HTML payload
    rdfaExtractor = RDFaExtractor()
    # NOTE(review): Windows-only '\\' path separators throughout;
    # os.path.join would be portable.
    if not os.path.exists('RDFa Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('RDFa Files\\WARC_{0}'.format(warc_file_no))
    if not os.path.exists('XML Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('XML Files\\WARC_{0}'.format(warc_file_no))
    print('[INFO/PROGRESS] The file being processed: {0}'.format(path))
    with open(path, encoding='utf-8', errors='replace') as file:
        for line in file:
            if debug and iteration_count % report_at_every_nth_step == 0:
                print("[DEBUG/PROGRESS] Processing line #{0:n}".format(
                    iteration_count))
            # A new WARC record header while HTML is being accumulated marks
            # the end of the current document: extract and flush it.
            if 'WARC/1.0' in line and append:
                append = False
                try:
                    rdfaData = rdfaExtractor.extract(data, base_url=htmlURL)
                    if rdfaData != []:
                        with open('RDFa Files\\WARC_{0}\\RDFa_{1}.txt'.format(
                                warc_file_no, rdfaFileID), 'w',
                                encoding='utf-8') as f:
                            f.write('URL: {0}\n\n'.format(htmlURL))
                            f.write(str(rdfaData))
                            # NOTE(review): redundant — the `with` block
                            # already closes the file.
                            f.close()
                        ConvertToXML.convertInstant(
                            str(rdfaData),
                            "XML Files\\WARC_{0}\\RDFa_{1}.xml".format(
                                warc_file_no, rdfaFileID))
                        if debug:
                            print(
                                "[DEBUG/PROGRESS] Processed file #{0} at URI {1} successfully"
                                .format(rdfaFileID, htmlURL))
                except json.decoder.JSONDecodeError as jde:
                    print(
                        '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This JSON-LD may be invalid.'
                        .format(rdfaFileID, str(jde)))
                except lxml.etree.ParserError as pe:
                    print(
                        '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This file may not have a valid RDFa representation.'
                        .format(rdfaFileID, str(pe)))
                except Exception as exc:
                    # Presumably the underlying RDF library raises a plain
                    # Exception whose message starts with "Can't split" for
                    # invalid XML namespaces — matched here by string
                    # prefix; TODO confirm.
                    if str(exc).startswith('Can\'t split'):
                        print(
                            '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This file may be containing invalid XML namespaces.'
                            .format(rdfaFileID, str(exc)))
                    else:
                        print(
                            '[ERROR] An error has occurred while processing current file (#{0}): {1}'
                            .format(rdfaFileID, str(exc)))
                finally:
                    # Reset accumulation state for the next record whether
                    # or not the extraction succeeded.
                    rdfaFileID += 1
                    data = ''
                    htmlURL = ''
            if 'WARC-Target-URI:' in line:
                # Strip the header name and any line-ending characters.
                htmlURL = line.replace('WARC-Target-URI: ', '').replace(
                    '\r', '').replace('\n', '')
            if '<!DOCTYPE html' in line or '<!doctype html' in line or '<html' in line:
                append = True
            if append:
                # NOTE(review): `line` already ends with '\n' when iterating
                # a text file, so this inserts a blank line between input
                # lines — possibly intentional; confirm before changing.
                data = data + line + '\n'
            iteration_count = iteration_count + 1
    return iteration_count