Example #1
 def test_returns_false_on_dict_list_with_empty_or_none_value(self):
     extras = Extras([
         {
             'key': 'bar',
             'value': 'bar-value'
         },
         {
             'key': 'baz',
             'value': 'baz-value'
         },
         {
             'key': 'foo',
             'value': ''
         },
         {
             'key': 'fuz',
             'value': '             '
         },
         {
             'key': 'muz',
             'value': None
         },
     ])
     self.assertFalse(extras.key('foo', disallow_empty=True))
     self.assertFalse(extras.key('fuz', disallow_empty=True))
     self.assertFalse(extras.key('muz', disallow_empty=True))
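These tests exercise a small Extras helper. A minimal sketch of the class they assume is shown below; it is inferred purely from the behaviour visible in these examples (construction from a CKAN-style list of key/value dicts or from a plain dict, key() with a disallow_empty flag, and value() with a default), so the real implementation, e.g. in ckanext-dcatde, may differ.

    class Extras(object):
        '''Minimal sketch inferred from the tests; not the real implementation.'''

        def __init__(self, extras):
            # Accept both a CKAN-style list of {'key': ..., 'value': ...} dicts
            # and a plain dict.
            if isinstance(extras, dict):
                self._extras = dict(extras)
            else:
                self._extras = {item['key']: item['value'] for item in extras}

        def key(self, name, disallow_empty=False):
            # True if the key exists; with disallow_empty=True the value must
            # also be neither None nor blank.
            if name not in self._extras:
                return False
            if disallow_empty:
                value = self._extras[name]
                return value is not None and bool(str(value).strip())
            return True

        def value(self, name, default=None):
            # Return the stored value, or the given default if the key is absent.
            return self._extras.get(name, default)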
Example #2
 def test_returns_false_on_dict_list(self):
     extras = Extras([
         {
             'key': 'bar',
             'value': 'bar-value'
         },
         {
             'key': 'baz',
             'value': 'baz-value'
         },
     ])
     self.assertFalse(extras.key('foo'))
Example #3
 def test_returns_true_on_dict_list(self):
     extras = Extras([
         {
             'key': 'foo',
             'value': 'foo-value'
         },
         {
             'key': 'bar',
             'value': 'bar-value'
         },
     ])
     self.assertTrue(extras.key('bar'))
Example #4
    def test_key_on_large_dict_list(self):
        extras_in = [{
            "key": "contacts",
            "value": "[{'url': 'www.open.nrw.de', 'role': 'vertrieb', 'name': 'Gesch\\u00e4ftsstelle Open.NRW', 'email': '*****@*****.**'}]"
        }, {
            "key": "dates",
            "value": "[{'date': '2016-06-08T12:31:11+02:00', 'role': 'erstellt'}, {'date': '2014-05-26T12:39:03+02:00', 'role': 'veroeffentlicht'}, {'date': '2016-06-08T12:31:11+02:00', 'role': 'aktualisiert'}]"
        }, {
            "key": "images",
            "value": "['https://open.nrw/profiles/nrw_ressort/themes/custom/nrw_base/images/grayish-blue/files/koeln_klein.png']"
        }, {
            "key": "metadata_original_portal",
            "value": "http://open.nrw/"
        }, {
            "key": "metadata_transformer",
            "value": ""
        }, {
            "key": "non_open",
            "value": "false"
        }, {
            "key": "opennrw_spatial",
            "value": "Stadt Köln"
        }, {
            "key": "original_groups",
            "value": "['Politik und Wahlen']"
        }, {
            "key": "spatial",
            "value": "{'type': 'polygon', 'coordinates': [[[6.7838099999999999, 50.825465999999999], [7.1533170000000004, 50.825465999999999], [7.1533170000000004, 51.090167999999998], [6.7838099999999999, 51.090167999999998], [6.7838099999999999, 50.825465999999999]]]}"
        }]

        extras = Extras(extras_in)

        for extra in extras_in:
            self.assertTrue(extras.key(extra['key']))
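Note that the 'value' strings in this fixture (contacts, dates, original_groups, spatial) are Python-literal representations rather than JSON. Assuming value() returns the raw string, a consumer could decode one of them roughly as follows; this is a hypothetical usage sketch, not part of the test:

    import ast

    extras = Extras(extras_in)  # extras_in as defined in the test above
    contacts = ast.literal_eval(extras.value('contacts'))
    print(contacts[0]['url'])  # -> www.open.nrw.de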
Example #5
    def handle_duplicates(cls, harvest_object_content):
        '''Compares the new dataset with existing ones and checks whether the dataset should be imported.'''

        method_prefix = 'handle_duplicates: '
        context = cls.build_context()

        remote_dataset = json.loads(harvest_object_content)
        remote_dataset_extras = Extras(remote_dataset['extras'])
        remote_dataset_name = remote_dataset.get('name', '')

        has_orig_id = remote_dataset_extras.key(EXTRAS_KEY_DCT_IDENTIFIER)
        if has_orig_id:
            orig_id = remote_dataset_extras.value(EXTRAS_KEY_DCT_IDENTIFIER)
            if orig_id:
                try:
                    data_dict = {
                        "q": EXTRAS_KEY_DCT_IDENTIFIER + ':"' + orig_id + '"'
                    }
                    # Add filter that local dataset guid is not equal to guid of the remote dataset
                    if remote_dataset_extras.key('guid'):
                        data_dict['fq'] = '-guid:"' + remote_dataset_extras.value('guid') + '"'
                    local_search_result = p.toolkit.get_action(
                        "package_search")(context, data_dict)
                    if local_search_result['count'] == 0:
                        LOGGER.debug('%sDid not find any existing dataset in the database. ' \
                            'Import accepted for %s.', method_prefix, remote_dataset_name)
                        return True
                    elif local_search_result['count'] == 1:
                        LOGGER.debug('%sFound duplicate entry for dataset %s.',
                                     method_prefix, remote_dataset_name)
                        local_dataset = local_search_result['results'][0]
                        local_dataset_extras = Extras(local_dataset['extras'])

                        # TODO: if in doubt, use the CKAN field "metadata_modified" of the local
                        # dataset when "modified" is not present?
                        if remote_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED) and \
                                local_dataset_extras.key(EXTRAS_KEY_DCT_MODIFIED):
                            return cls.compare_metadata_modified(
                                remote_dataset_extras.value(
                                    EXTRAS_KEY_DCT_MODIFIED),
                                local_dataset_extras.value(
                                    EXTRAS_KEY_DCT_MODIFIED))
                        else:
                            LOGGER.info(
                                '%sFound duplicate entry with the value "%s" in field "identifier", but ' \
                                'remote and/or local dataset does not contain a modified date. ' \
                                '-> Skipping import for %s!',
                                method_prefix, orig_id, remote_dataset_name)
                    else:
                        LOGGER.info('%sFound multiple duplicates with the value "%s" in field ' \
                            '"identifier". -> Skipping import for %s!', method_prefix, orig_id,
                            remote_dataset_name)
                except Exception as exception:
                    LOGGER.error(exception)
            else:
                LOGGER.debug(
                    '%sNo original id in field identifier found. Import accepted for %s.',
                    method_prefix, remote_dataset_name)
                return True
        else:
            LOGGER.debug(
                '%sNo field identifier found. Import accepted for %s.',
                method_prefix, remote_dataset_name)
            return True

        return False
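cls.compare_metadata_modified is not shown in this example. Judging from how it is called, it should return True when the remote dataset's dct:modified date is newer than the local one; a hedged sketch under that assumption, using python-dateutil for ISO 8601 strings such as '2016-06-08T12:31:11+02:00', might look like this (the real HarvestUtils method may behave differently, e.g. around equal dates):

    from dateutil.parser import parse as parse_date

    def compare_metadata_modified(remote_modified, local_modified):
        # Sketch only: accept the remote dataset if it is strictly newer.
        try:
            return parse_date(remote_modified) > parse_date(local_modified)
        except (TypeError, ValueError):
            # If a date cannot be parsed or compared, err on the side of
            # skipping the import.
            return False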
Example #6
 def test_returns_false_on_flat_list_with_empty_or_none_value(self):
     extras = Extras({'dates': 'foo', 'bar': 'baz', 'foo': '', 'fuz': None})
     self.assertFalse(extras.key('foo', disallow_empty=True))
     self.assertFalse(extras.key('fuz', disallow_empty=True))
Example #7
 def test_returns_false_on_flat_list(self):
     extras = Extras({'dates': 'foo', 'bar': 'baz'})
     self.assertFalse(extras.key('foo'))
Example #8
 def test_returns_true_on_flat_list(self):
     extras = Extras({'dates': 'foo', 'bar': 'baz'})
     self.assertTrue(extras.key('bar'))
Example #9
 def test_returns_false_on_empty_extras(self):
     extras = Extras([])
     self.assertFalse(extras.key('foo'))
Example #10
    def handle_duplicates(harvest_object_content):
        '''Compares the new dataset with existing ones and checks whether the dataset should be imported.'''

        method_prefix = 'handle_duplicates: '
        context = HarvestUtils.build_context()

        remote_dataset = json.loads(harvest_object_content)
        remote_dataset_extras = Extras(remote_dataset['extras'])
        remote_dataset_name = remote_dataset.get('name', '')

        has_orig_id = remote_dataset_extras.key(EXTRAS_KEY_DCT_IDENTIFIER)
        if has_orig_id:
            orig_id = remote_dataset_extras.value(EXTRAS_KEY_DCT_IDENTIFIER)
            # remote dataset contains identifier
            if orig_id:
                try:
                    data_dict = {
                        "q": EXTRAS_KEY_DCT_IDENTIFIER + ':"' + orig_id + '"'
                    }
                    # Add filter that local dataset guid is not equal to guid of the remote dataset
                    if remote_dataset_extras.key('guid'):
                        data_dict['fq'] = '-guid:"' + remote_dataset_extras.value('guid') + '"'
                    # search for other datasets with the same identifier
                    local_search_result = p.toolkit.get_action(
                        "package_search")(context, data_dict)
                    if local_search_result['count'] == 0:
                        # no other dataset with the same identifier was found, import accepted
                        LOGGER.debug(u'%sDid not find any existing dataset in the database with ' \
                                     u'Identifier %s. Import accepted for dataset %s.', method_prefix,
                                     orig_id, remote_dataset_name)
                        return True
                    else:
                        # another dataset with the same identifier was found
                        LOGGER.debug(
                            u'%sFound duplicate entries with Identifier %s for dataset %s.',
                            method_prefix, orig_id, remote_dataset_name)
                        remote_is_latest = True
                        local_dataset_has_modified = False
                        latest_local_dataset = {}
                        if not remote_dataset_extras.key(
                                EXTRAS_KEY_DCT_MODIFIED):
                            remote_is_latest = False

                        # compare modified date with all local datasets
                        for local_dataset in local_search_result['results']:
                            local_dataset_extras = Extras(
                                local_dataset['extras'])

                            if local_dataset_extras.key(
                                    EXTRAS_KEY_DCT_MODIFIED):
                                local_dataset_has_modified = True
                                # remember the local dataset with the latest date
                                _set_or_update_latest_dataset(
                                    latest_local_dataset,
                                    local_dataset_extras.value(
                                        EXTRAS_KEY_DCT_MODIFIED),
                                    local_dataset['id'])
                                # compare dct:modified if remote and local dataset contain the field
                                # "modified" and remote dataset is still not detected as older
                                if remote_is_latest and remote_dataset_extras.key(
                                        EXTRAS_KEY_DCT_MODIFIED):
                                    remote_is_latest = HarvestUtils.compare_metadata_modified(
                                        remote_dataset_extras.value(
                                            EXTRAS_KEY_DCT_MODIFIED),
                                        local_dataset_extras.value(
                                            EXTRAS_KEY_DCT_MODIFIED))

                        if remote_is_latest:
                            # Import accepted. Delete all local datasets with the same identifier.
                            LOGGER.debug(u'%sRemote dataset with Identifier %s is the latest. '\
                                         u'Modified date: %s. Import accepted for dataset %s.',
                                         method_prefix, orig_id,
                                         remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED),
                                         remote_dataset_name)
                            packages_deleted = _delete_packages_keep(
                                local_search_result['results'])
                            LOGGER.debug(u'%sDeleted packages: %s',
                                         method_prefix,
                                         ','.join(packages_deleted))
                            return True
                        elif local_dataset_has_modified:
                            # Skip import. Delete local datasets, but keep the dataset with the
                            # latest date in the field "modified".
                            LOGGER.info(u'%sRemote dataset with Identifier %s is NOT the latest. '\
                                        u'Modified date: %s. Keep local dataset with ' \
                                        u'latest date in field "modified". Skipping import for dataset %s!',
                                        method_prefix, orig_id,
                                        remote_dataset_extras.value(EXTRAS_KEY_DCT_MODIFIED, 'n/a'),
                                        remote_dataset_name)
                            packages_deleted = _delete_packages_keep(
                                local_search_result['results'],
                                latest_local_dataset)
                            LOGGER.debug(u'%sDeleted packages: %s',
                                         method_prefix,
                                         ','.join(packages_deleted))
                        else:
                            # Skip import, because neither the remote dataset nor any local dataset
                            # contains the field "modified". Delete local datasets, but keep the one
                            # most recently modified in the database.
                            LOGGER.info(
                                u'%sFound duplicate entries with the value "%s" in field "identifier", but ' \
                                u'remote and local datasets do not contain a modified date. ' \
                                u'Keep local dataset last modified in database. Skipping import for %s!',
                                method_prefix, orig_id, remote_dataset_name)
                            last_modified_local_dataset = {}
                            for local_dataset in local_search_result['results']:
                                # remember the local dataset with the latest date
                                _set_or_update_latest_dataset(
                                    last_modified_local_dataset,
                                    local_dataset.get('metadata_modified', None),
                                    local_dataset['id'])
                            packages_deleted = _delete_packages_keep(
                                local_search_result['results'],
                                last_modified_local_dataset)
                            LOGGER.debug(u'%sDeleted packages: %s',
                                         method_prefix,
                                         ','.join(packages_deleted))
                except Exception as exception:
                    LOGGER.error(exception)
            else:
                LOGGER.debug(
                    u'%sNo original id in field identifier found. Import accepted for dataset %s.',
                    method_prefix, remote_dataset_name)
                return True
        else:
            LOGGER.debug(
                u'%sNo field identifier found. Import accepted for dataset %s.',
                method_prefix, remote_dataset_name)
            return True

        return False
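The module-level helpers _set_or_update_latest_dataset and _delete_packages_keep are referenced but not defined in this example. The sketches below show one plausible shape inferred from the call sites: the first remembers the id of the dataset with the latest modified date in a shared dict, the second deletes the given packages via CKAN's package_delete action while optionally keeping one. p.toolkit and HarvestUtils.build_context() are assumed from the surrounding module, as in the example above; the actual implementations may differ, and comparing date strings lexically is a simplification.

    def _set_or_update_latest_dataset(latest_dataset, modified_candidate, dataset_id):
        # Sketch: remember the dataset with the latest modified date seen so far.
        # Lexical comparison is only safe for uniformly formatted ISO 8601 strings.
        if modified_candidate is None:
            return
        if 'date' not in latest_dataset or modified_candidate > latest_dataset['date']:
            latest_dataset['date'] = modified_candidate
            latest_dataset['id'] = dataset_id

    def _delete_packages_keep(local_datasets, dataset_to_keep=None):
        # Sketch: delete all given packages except the one to keep and return
        # the ids of the deleted packages.
        keep_id = dataset_to_keep.get('id') if dataset_to_keep else None
        deleted_ids = []
        for dataset in local_datasets:
            if dataset['id'] == keep_id:
                continue
            p.toolkit.get_action('package_delete')(
                HarvestUtils.build_context(), {'id': dataset['id']})
            deleted_ids.append(dataset['id'])
        return deleted_ids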