Example #1
    def test_externalize(self):
        sample_organization = {
            "id": "https://ratsinfo.leipzig.de/bi/oparl/1.0/organizations.asp?typ=gr&id=2286",
            "type": "https://schema.oparl.org/1.0/Organization",
            "name": "Beirat für Psychiatrie",
            "startDate": "2000-01-01",
            "endDate": "",
            "meeting": "https://ratsinfo.leipzig.de/bi/oparl/1.0/meetings.asp?organization=2286",
            "membership": [
                "https://ratsinfo.leipzig.de/bi/oparl/1.0/memberships.asp?typ=mg&id=1414"
            ],
            "location": {
                "id": "https://ratsinfo.leipzig.de/bi/oparl/1.0/locations.asp?id=32286",
                "type": "https://schema.oparl.org/1.0/Location",
                "description": "Friedrich-Ebert-Str. 19a, 04109 Leipzig",
                "street_address": "Friedrich-Ebert-Str. 19a",
                "postal_code": "04109",
                "subLocality": "",
                "locality": "Leipzig",
            },
            "created": "2000-01-01T12:00:00+01:00",
            "modified": "2018-04-10T12:14:31+02:00",
        }

        [location, organization] = list(externalize(sample_organization))
        self.assertEqual(organization.data["location"], location.data["id"])
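
None of these examples include `externalize` itself. Below is a minimal sketch consistent with the tests above, assuming a generator that flattens embedded objects into stand-alone entries. Treat the names and signature as assumptions: the project's function apparently yields unsaved CachedObject model instances (they are saved and bulk-created in later examples), and the revision tested in Example #3 seems to return a list rather than a generator.

import logging
from dataclasses import dataclass
from typing import Any, Dict, Iterator, List, Optional, Set

logger = logging.getLogger(__name__)

JSON = Dict[str, Any]


@dataclass
class Externalized:
    # Stand-in container; the examples suggest the real function yields
    # unsaved CachedObject model instances instead.
    data: JSON

    @property
    def url(self) -> str:
        return self.data["id"]

    @property
    def oparl_type(self) -> str:
        return self.data["type"]


def externalize(
    element: JSON, keys_of_interest: Optional[Set[str]] = None
) -> Iterator[Externalized]:
    """Replaces embedded objects by their id and yields each object separately.

    Embedded objects without an id are dropped with a warning whose wording
    follows the caplog assertion in Example #3.
    """
    element = dict(element)  # don't mutate the caller's dict
    for key, value in list(element.items()):
        if isinstance(value, dict):
            if "id" not in value:
                logger.warning(
                    f"Embedded object at {key} in {element['id']} does not "
                    f"have an id, skipping: {value}"
                )
                del element[key]
            else:
                if keys_of_interest is not None:
                    keys_of_interest.add(key)
                yield from externalize(value, keys_of_interest)
                element[key] = value["id"]
        elif isinstance(value, list) and any(isinstance(i, dict) for i in value):
            if keys_of_interest is not None:
                keys_of_interest.add(key)
            replaced: List[str] = []
            for item in value:
                if not isinstance(item, dict):
                    replaced.append(item)
                elif "id" not in item:
                    logger.warning(
                        f"Embedded object at {key} in {element['id']} does not "
                        f"have an id, skipping: {item}"
                    )
                else:
                    yield from externalize(item, keys_of_interest)
                    replaced.append(item["id"])
            element[key] = replaced
    yield Externalized(element)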
Example #2
    def _process_element(self, element: JSON) -> List[str]:
        keys_of_interest: Set[str] = set()
        new = list(externalize(element, keys_of_interest))
        # Find the ids of removed embedded objects
        # This way is not elegant, but it gets the job done.
        old_element = CachedObject.objects.filter(url=element["id"]).first()
        old_urls = set()
        if old_element:
            for key in keys_of_interest:
                if isinstance(old_element.data.get(key), list):
                    old_urls.update(old_element.data[key])
                elif isinstance(old_element.data.get(key), str):
                    old_urls.add(old_element.data[key])

        removed = old_urls - {i.url for i in new}
        fetch_later = CachedObject.objects.filter(url__in=removed).values_list(
            "url", flat=True)
        for instance in new:
            existing = CachedObject.objects.filter(url=instance.url).first()
            if existing:
                if existing.data == instance.data:
                    continue
                else:
                    existing.data = instance.data
                    existing.to_import = True
                    existing.save()
            else:
                instance.save()
        return list(fetch_later)
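
Note the out-parameter: `_process_element` hands `externalize` an empty set, which comes back filled with the names of the keys whose embedded objects were split off, so the previously cached copy can be diffed for removed URLs. A small illustration against the sketch above, with hypothetical URLs:

from typing import Set

doc = {
    "id": "https://example.org/organization/1",  # hypothetical
    "type": "https://schema.oparl.org/1.0/Organization",
    "location": {
        "id": "https://example.org/location/1",  # hypothetical
        "type": "https://schema.oparl.org/1.0/Location",
    },
}
keys_of_interest: Set[str] = set()
new = list(externalize(doc, keys_of_interest))
assert keys_of_interest == {"location"}
assert [e.url for e in new] == [
    "https://example.org/location/1",
    "https://example.org/organization/1",
]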
Example #3
def test_externalize_missing_id(caplog):
    """In http://buergerinfo.ulm.de/oparl/bodies/0001/meetings/11445, the embedded location does not have an id"""
    json_in = {
        "id": "http://buergerinfo.ulm.de/oparl/bodies/0001/meetings/11445",
        "type": "https://schema.oparl.org/1.1/Meeting",
        "name": "Klausurtagung des Gemeinderats",
        "start": "2021-06-12T09:00:00+02:00",
        "end": "2021-06-12T00:00:00+02:00",
        "location": {"description": "Ulm-Messe,"},
        "organization": [
            "http://buergerinfo.ulm.de/oparl/bodies/0001/organizations/gr/1"
        ],
        "created": "2020-11-11T09:47:04+01:00",
        "modified": "2020-11-11T09:48:13+01:00",
    }
    # Check that location has been removed but everything else remained the same
    json_out = json_in.copy()
    del json_out["location"]

    externalized = externalize(json_in)
    assert len(externalized) == 1
    assert externalized[0].data == json_out
    assert caplog.messages == [
        "Embedded object at location in "
        "http://buergerinfo.ulm.de/oparl/bodies/0001/meetings/11445 does not have an "
        "id, skipping: {'description': 'Ulm-Messe,'}"
    ]
Example #4
    def import_anything(self, oparl_id: str) -> DefaultFields:
        """ Hacky metaprogramming to import any object based on its id """
        logging.info("Importing single object {}".format(oparl_id))

        to_return = None

        loaded = self.loader.load(oparl_id)
        # When a resource moved, the user-specified id might be different from the object's id
        # The loader prints a warning in that case
        oparl_id = loaded["id"]
        externalized = list(externalize(loaded))
        # To avoid endless recursion, we sort the objects so that if A links to B then B gets imported first
        externalized.sort(key=lambda entry: import_order.index(
            getattr(models, entry.data["type"].split("/")[-1])))

        for entry in externalized:
            instance = self.import_any_externalized(entry.data)

            defaults = {
                "url": entry.url,
                "data": entry.data,
                "oparl_type": entry.oparl_type,
                "to_import": False,
            }
            CachedObject.objects.update_or_create(url=entry.url,
                                                  defaults=defaults)

            if entry.url == oparl_id:
                to_return = instance

        assert to_return, "Missing object for {}".format(oparl_id)
        return to_return
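
The sort key is dense: it takes the last path segment of the OParl `type` URL, resolves it to a model class via `getattr` on the project's `models` module, and ranks it by its position in `import_order`, a dependency-ordered list of model classes that is referenced but not shown here. Spelled out as a named function, assuming those two project names:

def import_rank(entry) -> int:
    """Lower rank means imported earlier, so link targets precede referrers."""
    model_name = entry.data["type"].split("/")[-1]  # e.g. "Location"
    model_cls = getattr(models, model_name)  # e.g. models.Location
    return import_order.index(model_cls)  # position in the dependency order

# equivalent to: externalized.sort(key=import_rank)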
Example #5
    def fetch_list_initial(self, url: str) -> None:
        """Saves a complete external list as flattened json to the database"""
        logger.info(f"Fetching List {url}")

        timestamp = timezone.now()
        next_url = url
        all_objects = set()
        while next_url:
            logger.info(f"Fetching {next_url}")
            response = self.loader.load(next_url)

            objects = set()

            for element in response["data"]:
                for i in externalize(element):
                    # Collect each new, non-deleted object exactly once
                    if not i.data.get("deleted") and i not in all_objects:
                        objects.add(i)

            next_url = response["links"].get("next")

            # We can't move this block outside the loop due to MySQL's max_allowed_packet,
            # which manifests as "MySQL server has gone away" https://stackoverflow.com/a/36637118/3549270
            # We'll be able to solve this a lot better after the Django 2.2 update with ignore_conflicts
            try:
                # Also avoid "MySQL server has gone away" errors due to timeouts
                # https://stackoverflow.com/a/32720475/3549270
                db.close_old_connections()
                # The tests are run with sqlite, which failed here with a TransactionManagementError:
                # "An error occurred in the current transaction.
                # You can't execute queries until the end of the 'atomic' block."
                # That's why we build our own atomic block
                if settings.TESTING:
                    with transaction.atomic():
                        saved_objects = CachedObject.objects.bulk_create(
                            objects)
                else:
                    saved_objects = CachedObject.objects.bulk_create(objects)
            except IntegrityError:
                saved_objects = set()
                for i in objects:
                    defaults = {
                        "data": i.data,
                        "to_import": True,
                        "oparl_type": i.oparl_type,
                    }
                    saved_objects.add(
                        CachedObject.objects.update_or_create(
                            url=i.url, defaults=defaults)[0])

            all_objects.update(saved_objects)
        logger.info(f"Found {len(all_objects)} objects in {url}")
        ExternalList(url=url, last_update=timestamp).save()
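
The comment inside the loop anticipates Django 2.2, where `bulk_create` gained `ignore_conflicts` and the `IntegrityError` fallback collapses into a single call. The trade-off: conflicting rows are silently skipped rather than updated, so changed objects would still need the `update_or_create` path. A sketch of the post-2.2 variant, as a drop-in for the try block above:

# Django >= 2.2: the database skips rows whose url already exists.
# Unlike the update_or_create fallback above, existing rows are NOT
# refreshed, so changed data still needs a separate update pass.
saved_objects = CachedObject.objects.bulk_create(
    list(objects), ignore_conflicts=True
)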
Example #6
    def import_anything(
            self,
            oparl_id: str,
            object_type: Optional[Type[T]] = None) -> DefaultFields:
        """Hacky metaprogramming to import any object based on its id"""
        logging.info("Importing single object {}".format(oparl_id))

        to_return = None

        try:
            loaded = self.loader.load(oparl_id)
        except HTTPError as e:
            logger.error(f"Failed to load {oparl_id}: {e}")
            # This is a horrible workaround for broken oparl implementations
            # See test_missing.py
            if object_type and issubclass(object_type, DummyInterface):
                logger.error(f"Using a dummy for {oparl_id}. THIS IS BAD.")
                # noinspection PyTypeChecker
                dummy: T = object_type.dummy(oparl_id)
                dummy.save()
                return dummy
            else:
                raise

        # When a resource moved, the user-specified id might be different from the object's id
        # The loader prints a warning in that case
        oparl_id = loaded["id"]
        externalized = list(externalize(loaded))
        # To avoid endless recursion, we sort the objects so that if A links to B then B gets imported first
        externalized.sort(key=lambda entry: import_order.index(
            getattr(models, entry.data["type"].split("/")[-1])))

        for entry in externalized:
            instance = self.import_any_externalized(entry.data)

            defaults = {
                "url": entry.url,
                "data": entry.data,
                "oparl_type": entry.oparl_type,
                "to_import": False,
            }
            CachedObject.objects.update_or_create(url=entry.url,
                                                  defaults=defaults)

            if entry.url == oparl_id:
                to_return = instance

        assert to_return, "Missing object for {}".format(oparl_id)
        return to_return

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.api_data = {}
        cls.loader = MockLoader()
        cls.loader.api_data = cls.api_data
        for file in os.listdir(cls.dummy_data):
            if not file.endswith(".json"):
                continue

            with open(os.path.join(cls.dummy_data, file)) as fp:
                data = json.load(fp)
                cls.api_data[data["id"]] = data
                for entry in externalize(data):
                    if entry.data["id"] not in cls.api_data:
                        cls.api_data[entry.data["id"]] = entry.data

        # Used by test_location_default_body
        body = Body()
        body.short_name = "München"
        cls.converter = JsonToDb(cls.loader, default_body=body)
        cls.converter.warn_missing = False
        cls.utils = Utils()
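
`setUpClass` externalizes every fixture so that embedded objects become loadable under their own ids. The `MockLoader` contract this relies on is essentially a dict lookup; a hypothetical minimal stand-in might look like this (the project's class may do more, e.g. simulate HTTP errors):

from typing import Any, Dict

JSON = Dict[str, Any]  # as in the sketch after Example #1


class MockLoader:
    """Hypothetical minimal mock resolving OParl ids from a prefilled dict."""

    def __init__(self) -> None:
        self.api_data: Dict[str, JSON] = {}

    def load(self, url: str) -> JSON:
        # Raises KeyError for ids the fixtures don't cover
        return self.api_data[url]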
    def import_anything(
        self, oparl_id: str, object_type: Optional[Type[T]] = None
    ) -> DefaultFields:
        """Hacky metaprogramming to import any object based on its id"""
        logging.info(f"Importing single object {oparl_id}")

        try:
            loaded = self.loader.load(oparl_id)
        except HTTPError as e:
            logger.error(
                f"Failed to load {oparl_id}. Using a dummy instead. THIS IS BAD: {e}"
            )
            return self._make_dummy(oparl_id, object_type)

        if not isinstance(loaded, dict):
            logger.error(
                f"JSON loaded from {oparl_id} is not a dict/object. Using a dummy instead. THIS IS BAD"
            )
            return self._make_dummy(oparl_id, object_type)
        if "type" not in loaded:
            if object_type:
                loaded["type"] = "https://schema.oparl.org/1.0/" + object_type.__name__
                logger.warning(
                    f"Object loaded from {oparl_id} has no type field, inferred to {loaded['type']}"
                )
            else:
                raise RuntimeError(
                    f"The object {oparl_id} has not type field and object_type wasn't given"
                )

        if "id" not in loaded:
            logger.warning(
                f"Object loaded from {oparl_id} has no id field, setting id to url"
            )
            loaded["id"] = oparl_id
        oparl_id = loaded["id"]
        externalized = list(externalize(loaded))
        # To avoid endless recursion, we sort the objects so that if A links to B then B gets imported first
        externalized.sort(
            key=lambda entry: import_order.index(
                getattr(models, entry.data["type"].split("/")[-1])
            )
        )

        to_return = None
        for entry in externalized:
            instance = self.import_any_externalized(entry.data)

            defaults = {
                "url": entry.url,
                "data": entry.data,
                "oparl_type": entry.oparl_type,
                "to_import": False,
            }
            CachedObject.objects.update_or_create(url=entry.url, defaults=defaults)

            if entry.url == oparl_id:
                to_return = instance

        assert to_return, f"Missing object for {oparl_id}"
        return to_return
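
`_make_dummy` is called twice above but not shown. Reconstructed from the inline fallback in the earlier revision of `import_anything`, it presumably looks roughly like this (a guess, not the project's code):

    def _make_dummy(
        self, oparl_id: str, object_type: Optional[Type[T]]
    ) -> DefaultFields:
        # Reconstructed from the inline fallback above; details may differ.
        if object_type and issubclass(object_type, DummyInterface):
            logger.error(f"Using a dummy for {oparl_id}. THIS IS BAD.")
            # noinspection PyTypeChecker
            dummy: T = object_type.dummy(oparl_id)
            dummy.save()
            return dummy
        raise RuntimeError(
            f"Can't substitute a dummy for {oparl_id}: no DummyInterface type given"
        )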