def test_wikipedia_xhtml_rdfa_no_prefix(self):
    """RDFa extraction from a page whose markup declares no explicit prefixes."""
    html = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
    expected_jsonld = json.loads(
        get_testdata('misc', 'Portfolio_Niels_Lubberman.json').decode('UTF-8')
    )
    extractor = RDFaExtractor()
    extracted = extractor.extract(html, base_url='http://nielslubberman.nl/drupal/')
    self.assertJsonLDEqual(extracted, expected_jsonld)
def test_expanded_opengraph_support(self):
    """Extraction of expanded Open Graph vocabulary via the RDFa extractor."""
    html = get_testdata('misc', 'expanded_OG_support_test.html')
    expected_jsonld = json.loads(
        get_testdata('misc', 'expanded_OG_support_test.json').decode('UTF-8')
    )
    extractor = RDFaExtractor()
    extracted = extractor.extract(html, base_url='http://www.example.com/index.html')
    self.assertJsonLDEqual(extracted, expected_jsonld)
def test_wikipedia_xhtml_rdfa(self):
    """RDFa extraction from the Wikipedia XHTML+RDFa sample page."""
    stem = 'xhtml+rdfa'
    page = get_testdata('wikipedia', stem + '.html').decode('UTF-8')
    expected_jsonld = json.loads(
        get_testdata('wikipedia', stem + '.expanded.json').decode('UTF-8'))
    # NOTE(review): 'exaple.com' looks like a typo for 'example.com', but the
    # expected fixture may have been generated against this exact base URL,
    # so the string is left unchanged.
    extracted = RDFaExtractor().extract(page, url='http://www.exaple.com/index.html')
    self.assertJsonLDEqual(extracted, expected_jsonld)
def test_wikipedia_xhtml_rdfa(self):
    """RDFa extraction from the Wikipedia xhtml+rdfa test fixture."""
    fixture = 'xhtml+rdfa'
    markup = get_testdata('wikipedia', fixture + '.html').decode('UTF-8')
    want = json.loads(
        get_testdata('wikipedia', fixture + '.expanded.json').decode('UTF-8'))
    # NOTE(review): 'exaple.com' is kept verbatim — the expected JSON may
    # depend on this exact base URL; confirm before "fixing" the typo.
    got = RDFaExtractor().extract(markup, url='http://www.exaple.com/index.html')
    self.assertJsonLDEqual(got, want)
def test_w3c_rdf11primer(self):
    """RDFa extraction for selected RDF 1.1 Primer examples from the W3C."""
    for example_no in [14]:
        stem = 'w3c.rdf11primer.example{:03d}'.format(example_no)
        markup = get_testdata('w3crdfa', stem + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        # NOTE(review): 'exaple.com' kept verbatim; the fixture may depend on
        # this exact base URL.
        got = RDFaExtractor().extract(markup, url='http://www.exaple.com/index.html')
        self.assertJsonLDEqual(got, want)
def test_w3c_rdf11primer(self):
    """RDFa extraction for selected RDF 1.1 Primer examples from the W3C."""
    for example_no in [14]:
        stem = 'w3c.rdf11primer.example{:03d}'.format(example_no)
        # Raw bytes are handed straight to the extractor here (no decode).
        markup = get_testdata('w3crdfa', stem + '.html')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        got = RDFaExtractor().extract(markup, url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)
def test_w3c_rdfaprimer(self):
    """RDFa extraction for selected RDFa Primer examples from the W3C.

    Also checks that a failure inside ``_fix_order`` (issue 116 fix) does
    not change the extractor's output.
    """
    for example_no in [5, 6, 7, 8, 9, 10, 11, 15]:
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
        print(stem)
        markup = get_testdata('w3crdfa', stem + '.html')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        extractor = RDFaExtractor()
        got = extractor.extract(markup, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)

        # This is for testing that the fix to issue 116 does not affect
        # severely rdfa output even in a presence of a bug in the code
        def broken_fix_order(x, y, z):
            raise Exception()

        extractor._fix_order = broken_fix_order
        got = extractor.extract(markup, base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False, extract_microdata: bool = False,
            microdata_base_url: str = "", extract_json_ld: bool = False,
            extract_rdfa: bool = False, rdfa_base_url: str = "") \
        -> List[Extraction]:
    """Run the selected metadata extractors over an HTML document.

    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): if True, include {"title": "..."} from the
            <title> tag
        extract_meta (bool): if True, include {"meta": {...}} built from
            the <meta> tags
        extract_microdata (bool): if True, include {"microdata": [...]}
        microdata_base_url (str): base namespace url for microdata
            ("" if unspecified)
        extract_json_ld (bool): if True, include {"json-ld": [...]}
        extract_rdfa (bool): if True, include {"rdfa": [...]}
        rdfa_base_url (str): base namespace url for rdfa ("" if unspecified)

    Returns:
        List[Extraction]: one entry per requested extraction, or the empty
        list if nothing matched.
    """
    results = list()
    soup = BeautifulSoup(html_text, 'html.parser')

    # Title/meta extraction both require a parsed <title> to be present
    # (original operand order of the conditions is preserved).
    if soup.title and extract_title:
        results.append(self._wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8')))

    if soup.title and extract_meta:
        results.append(self._wrap_data(
            "meta", self._wrap_meta_content(soup.find_all("meta"))))

    if extract_microdata:
        results.append(self._wrap_data(
            "microdata",
            MicrodataExtractor().extract(html_text, microdata_base_url)))

    if extract_json_ld:
        results.append(self._wrap_data(
            "json-ld", JsonLdExtractor().extract(html_text)))

    if extract_rdfa:
        results.append(self._wrap_data(
            "rdfa", RDFaExtractor().extract(html_text, rdfa_base_url)))

    return results
def test_w3c_rdfaprimer(self):
    """RDFa extraction for selected RDFa Primer examples, with debug output."""
    for example_no in [5, 6, 7, 8, 9, 10, 11, 15]:
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
        print(stem)
        markup = get_testdata('w3crdfa', stem + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        got = RDFaExtractor().extract(markup, url='http://www.example.com/index.html')
        # Dump both representations to ease diffing on failure.
        print("extracted:\n%s" % pformat(tupleize(got)))
        print("expected:\n%s" % pformat(tupleize(want)))
        print("extracted:\n%s" % self.prettify(got))
        print("expected:\n%s" % self.prettify(want))
        self.assertJsonLDEqual(got, want)
def test_w3c_rdfaprimer(self):
    """RDFa extraction for selected RDFa Primer examples, with debug output."""
    for example_no in [5, 6, 7, 8, 9, 10, 11, 15]:
        stem = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
        print(stem)
        # Raw bytes are handed straight to the extractor here (no decode).
        markup = get_testdata('w3crdfa', stem + '.html')
        want = json.loads(
            get_testdata('w3crdfa', stem + '.expanded.json').decode('UTF-8'))
        got = RDFaExtractor().extract(markup, url='http://www.example.com/index.html')
        # Dump both representations to ease diffing on failure.
        print("extracted:\n%s" % pformat(tupleize(got)))
        print("expected:\n%s" % pformat(tupleize(want)))
        print("extracted:\n%s" % self.prettify(got))
        print("expected:\n%s" % self.prettify(want))
        self.assertJsonLDEqual(got, want)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False, extract_microdata: bool = False,
            extract_json_ld: bool = False, extract_rdfa: bool = False) \
        -> List[Extraction]:
    """Run the selected metadata extractors over *html_text*.

    Each enabled flag contributes one wrapped entry ("title", "meta",
    "microdata", "json-ld" or "rdfa") to the returned list; the list is
    empty when nothing is enabled or nothing matches.
    """
    results = list()
    soup = BeautifulSoup(html_text, 'html.parser')

    # Title/meta extraction both require a parsed <title> to be present
    # (original operand order of the conditions is preserved).
    if soup.title and extract_title:
        results.append(self.wrap_data(
            "title", soup.title.string.encode('utf-8').decode('utf-8')))

    if soup.title and extract_meta:
        results.append(self.wrap_data(
            "meta", self.wrap_meta_content(soup.find_all("meta"))))

    if extract_microdata:
        results.append(self.wrap_data(
            "microdata", MicrodataExtractor().extract(html_text)))

    if extract_json_ld:
        results.append(self.wrap_data(
            "json-ld", JsonLdExtractor().extract(html_text)))

    if extract_rdfa:
        results.append(self.wrap_data(
            "rdfa", RDFaExtractor().extract(html_text)))

    return results
def extract_all_rdfa(response):
    """Return all RDFa data found in *response*, resolved against its URL."""
    return RDFaExtractor().extract(response.text, url=response.url)
class RISJMetadataExtractor(object): """An extruct-based metadata extractor""" # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then # test on body of crawlers! def __init__(self, response, microdata=False, jsonld=False, rdfa=False): self.response = response self.microdata = microdata self.jsonld = jsonld self.rdfa = rdfa if rdfa: try: self.rdfae = RDFaExtractor() self.rdfadata = self.rdfae.extract(self.response.text, url=self.response.url) except JSONDecodeError: pass if microdata: try: self.mde = MicrodataExtractor() self.mdedata = self.mde.extract(self.response.text) except JSONDecodeError: pass if jsonld: try: self.jlde = JsonLdExtractor() self.jldata = self.jlde.extract(self.response.text) except (JSONDecodeError, TypeError): self.jldata = [] finally: # Sometimes we get this in the meta dict from RISJExtractJSONLD self.jldata.extend(self.response.meta.get('json-ld', [])) def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None, rdfa=None): """Extract schema.org NewsArticle metadata, encoded using any supported metadata format. Note that we only try to extract the *first* block of NewsArticle data for each method (which is then combined with the first extracted from other methods if more than one is selected.""" if microdata is None: microdata = self.microdata if jsonld is None: jsonld = self.jsonld if rdfa is None: rdfa = self.rdfa outd = {} if jsonld: for d in self.jldata: # logger.debug('Analysing JSON-LD data: '+pformat(d)) try: if (re.match(r'https?://schema.org/?', d['@context']) and d['@type'] == 'NewsArticle'): outd.update(d) except (KeyError, TypeError): continue if microdata: for d in self.mdedata: logger.debug('Analysing W3C microdata: ' + pformat(d)) if re.match(r'https?://schema.org/NewsArticle/?', d.get('type', '')): outd.update(d) if rdfa: raise NotImplementedError # logger.debug('Returning schema.org NewsArticle: '+pformat(outd)) return outd
def get_rdfa_from_warc(warc_file_no, path):
    # Scan one WARC file line by line, extract RDFa from each embedded HTML
    # document, and write each result as text plus (via ConvertToXML) as
    # RDF/XML.
    #
    # warc_file_no: index used to name the per-WARC output directories.
    # path: filesystem path of the WARC file to read.
    # Returns the global iteration_count (total lines processed so far).
    global iteration_count
    global report_at_every_nth_step
    rdfaFileID = 1      # sequential output-file counter within this WARC
    htmlURL = ''        # URL of the HTML document currently being collected
    data = ''           # accumulated HTML of the current document
    append = False      # True while inside an HTML payload
    rdfaExtractor = RDFaExtractor()
    # NOTE(review): Windows-only '\\' path separators throughout;
    # os.path.join would be portable.
    if not os.path.exists('RDFa Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('RDFa Files\\WARC_{0}'.format(warc_file_no))
    if not os.path.exists('XML Files\\WARC_{0}'.format(warc_file_no)):
        os.makedirs('XML Files\\WARC_{0}'.format(warc_file_no))
    print('[INFO/PROGRESS] The file being processed: {0}'.format(path))
    with open(path, encoding='utf-8', errors='replace') as file:
        for line in file:
            if debug and iteration_count % report_at_every_nth_step == 0:
                print("[DEBUG/PROGRESS] Processing line #{0:n}".format(
                    iteration_count))
            # A new WARC record header while HTML is being accumulated marks
            # the end of the current document: extract and flush it.
            if 'WARC/1.0' in line and append:
                append = False
                try:
                    rdfaData = rdfaExtractor.extract(data, base_url=htmlURL)
                    if rdfaData != []:
                        with open('RDFa Files\\WARC_{0}\\RDFa_{1}.txt'.format(
                                warc_file_no, rdfaFileID), 'w',
                                encoding='utf-8') as f:
                            f.write('URL: {0}\n\n'.format(htmlURL))
                            f.write(str(rdfaData))
                            # NOTE(review): redundant — the `with` block
                            # already closes the file.
                            f.close()
                        ConvertToXML.convertInstant(
                            str(rdfaData),
                            "XML Files\\WARC_{0}\\RDFa_{1}.xml".format(
                                warc_file_no, rdfaFileID))
                        if debug:
                            print(
                                "[DEBUG/PROGRESS] Processed file #{0} at URI {1} successfully"
                                .format(rdfaFileID, htmlURL))
                except json.decoder.JSONDecodeError as jde:
                    print(
                        '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This JSON-LD may be invalid.'
                        .format(rdfaFileID, str(jde)))
                except lxml.etree.ParserError as pe:
                    print(
                        '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This file may not have a valid RDFa representation.'
                        .format(rdfaFileID, str(pe)))
                except Exception as exc:
                    # Presumably the underlying RDF library raises a plain
                    # Exception whose message starts with "Can't split" for
                    # invalid XML namespaces — matched here by string
                    # prefix; TODO confirm.
                    if str(exc).startswith('Can\'t split'):
                        print(
                            '[ERROR] Current file (#{0}) could not be converted to RDF/XML: {1} | This file may be containing invalid XML namespaces.'
                            .format(rdfaFileID, str(exc)))
                    else:
                        print(
                            '[ERROR] An error has occurred while processing current file (#{0}): {1}'
                            .format(rdfaFileID, str(exc)))
                finally:
                    # Reset accumulation state for the next record whether
                    # or not the extraction succeeded.
                    rdfaFileID += 1
                    data = ''
                    htmlURL = ''
            if 'WARC-Target-URI:' in line:
                # Strip the header name and any line-ending characters.
                htmlURL = line.replace('WARC-Target-URI: ', '').replace(
                    '\r', '').replace('\n', '')
            if '<!DOCTYPE html' in line or '<!doctype html' in line or '<html' in line:
                append = True
            if append:
                # NOTE(review): `line` already ends with '\n' when iterating
                # a text file, so this inserts a blank line between input
                # lines — possibly intentional; confirm before changing.
                data = data + line + '\n'
            iteration_count = iteration_count + 1
    return iteration_count