Example 1
    def test_parse_without_pagination(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
        <rdfs:SomeClass rdf:about="http://example.org">
            <rdfs:label>Some label</rdfs:label>
        </rdfs:SomeClass>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        eq_(p.next_page(), None)
Example 2
    def test_parse_pagination_last_page(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
         <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
            <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
            <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
            <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        eq_(p.next_page(), None)
Example 3
    def test_parse_pagination_next_page(self):

        data = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
         <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=1">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
            <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
            <hydra:nextPage>http://example.com/catalog.xml?page=2</hydra:nextPage>
            <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        p = RDFParser()

        p.parse(data)

        assert p.next_page() == 'http://example.com/catalog.xml?page=2'
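
For reference, here is a minimal sketch of the pagination lookup these three tests exercise, assuming rdflib and the hydra vocabulary used in the fixtures above; the actual RDFParser.next_page() in ckanext-dcat wraps logic along these lines but may differ in detail:

    from rdflib import Graph
    from rdflib.namespace import Namespace, RDF

    HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')

    def next_page(data):
        """Return the hydra:nextPage URL of an RDF/XML document, or None."""
        g = Graph()
        g.parse(data=data, format='xml')
        # Pagination links hang off a hydra:PagedCollection node; a document
        # with no such node (or no nextPage triple) yields None
        for collection in g.subjects(RDF.type, HYDRA.PagedCollection):
            for page in g.objects(collection, HYDRA.nextPage):
                return str(page)
        return None

Run against the three fixtures above, this returns None for the unpaginated and last-page documents and 'http://example.com/catalog.xml?page=2' for the first-page document.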
Example 4
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(
                harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []
        last_content_hash = None

        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(
                    next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(
                next_page_url, harvest_job, 1, content_type=rdf_format)

            content_hash = hashlib.md5()
            if content:
                # md5 needs bytes; guard against empty or text content
                content_hash.update(content if isinstance(content, bytes) else content.encode('utf8'))

            if last_content_hash:
                if content_hash.digest() == last_content_hash.digest():
                    log.warning(
                        'Remote content was the same even when using a paginated URL, skipping'
                    )
                    break
            else:
                last_content_hash = content_hash

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(
                    content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error(
                    'Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            for dataset in parser.datasets():
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    source_dataset = model.Package.get(harvest_job.source.id)
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset)

                if not guid:
                    self._save_gather_error(
                        'Could not get a unique identifier for dataset: {0}'.
                        format(dataset), harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid,
                                    job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)

            # get the next page
            next_page_url = parser.next_page()

        return object_ids
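
The before_download and after_download calls in this gather stage are extension points: plugins implementing IDCATRDFHarvester can rewrite the page URL or the downloaded content, and returning a falsy value aborts the stage. A minimal sketch of such a plugin, assuming the IDCATRDFHarvester interface from ckanext.dcat.interfaces; the class name and the https check are illustrative:

    import ckan.plugins as p
    from ckanext.dcat.interfaces import IDCATRDFHarvester

    class ExampleRDFHarvestPlugin(p.SingletonPlugin):
        p.implements(IDCATRDFHarvester, inherit=True)

        def before_download(self, url, harvest_job):
            # Rewrite or veto the URL before each page is fetched;
            # returning a falsy URL makes gather_stage return no objects
            errors = []
            if url and not url.startswith('https://'):
                errors.append('Insecure harvest source URL: {0}'.format(url))
                url = None
            return url, errors

        def after_download(self, content, harvest_job):
            # Inspect or transform the downloaded content;
            # returning empty content likewise stops the gather stage
            return content, []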
Example 5
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(
                harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []
        last_content_hash = None
        self._names_taken = []

        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(
                    next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(
                next_page_url, harvest_job, 1, content_type=rdf_format)

            content_hash = hashlib.md5()
            if content:
                if six.PY2:
                    content_hash.update(content)
                else:
                    content_hash.update(content.encode('utf8'))

            if last_content_hash:
                if content_hash.digest() == last_content_hash.digest():
                    log.warning(
                        'Remote content was the same even when using a paginated URL, skipping'
                    )
                    break
            else:
                last_content_hash = content_hash

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(
                    content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error(
                    'Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                parser, after_parsing_errors = harvester.after_parsing(
                    parser, harvest_job)

                for error_msg in after_parsing_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not parser:
                return []

            try:

                source_dataset = model.Package.get(harvest_job.source.id)

                for dataset in parser.datasets():
                    if not dataset.get('name'):
                        dataset['name'] = self._gen_new_name(dataset['title'])
                    if dataset['name'] in self._names_taken:
                        suffix = len([
                            i for i in self._names_taken
                            if i.startswith(dataset['name'] + '-')
                        ]) + 1
                        dataset['name'] = '{}-{}'.format(
                            dataset['name'], suffix)
                    self._names_taken.append(dataset['name'])

                    # Unless already set by the parser, get the owner organization (if any)
                    # from the harvest source dataset
                    if not dataset.get('owner_org'):
                        if source_dataset.owner_org:
                            dataset['owner_org'] = source_dataset.owner_org

                    # Try to get a unique identifier for the harvested dataset
                    guid = self._get_guid(dataset,
                                          source_url=source_dataset.url)

                    if not guid:
                        self._save_gather_error(
                            'Could not get a unique identifier for dataset: {0}'
                            .format(dataset), harvest_job)
                        continue

                    dataset['extras'].append({'key': 'guid', 'value': guid})
                    guids_in_source.append(guid)

                    obj = HarvestObject(guid=guid,
                                        job=harvest_job,
                                        content=json.dumps(dataset))

                    obj.save()
                    object_ids.append(obj.id)
            except Exception as e:
                self._save_gather_error(
                    'Error when processing dataset: %r / %s' %
                    (e, traceback.format_exc()), harvest_job)
                return []

            # get the next page
            next_page_url = parser.next_page()

        # Check if some datasets need to be deleted
        object_ids_to_delete = self._mark_datasets_for_deletion(
            guids_in_source, harvest_job)

        object_ids.extend(object_ids_to_delete)

        return object_ids
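
The md5 bookkeeping above guards against endpoints that ignore the page parameter and keep serving the same document: comparing each page's digest with the previous one lets the while loop break instead of re-harvesting an identical page forever. The same digest logic as a standalone helper (the function name is illustrative):

    import hashlib

    def content_digest(content):
        # hashlib needs bytes; on Python 3 the harvested content may be text
        h = hashlib.md5()
        if content:
            h.update(content if isinstance(content, bytes) else content.encode('utf8'))
        return h.digest()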
Example 6
    def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

        rdf_format = None
        if harvest_job.source.config:
            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

        # Get file contents of first page
        next_page_url = harvest_job.source.url

        guids_in_source = []
        object_ids = []

        while next_page_url:
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)

                for error_msg in before_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

                if not next_page_url:
                    return []

            content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)

            # TODO: store content?
            for harvester in p.PluginImplementations(IDCATRDFHarvester):
                content, after_download_errors = harvester.after_download(content, harvest_job)

                for error_msg in after_download_errors:
                    self._save_gather_error(error_msg, harvest_job)

            if not content:
                return []

            # TODO: profiles conf
            parser = RDFParser()

            try:
                parser.parse(content, _format=rdf_format)
            except RDFParserException as e:
                self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
                return []

            for dataset in parser.datasets():
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    source_dataset = model.Package.get(harvest_job.source.id)
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset)

                if not guid:
                    self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
                                            harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid, job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)

            # get the next page
            next_page_url = parser.next_page()

        return object_ids