def test_externalize(self):
    sample_organization = {
        "id": "https://ratsinfo.leipzig.de/bi/oparl/1.0/organizations.asp?typ=gr&id=2286",
        "type": "https://schema.oparl.org/1.0/Organization",
        "name": "Beirat für Psychiatrie",
        "startDate": "2000-01-01",
        "endDate": "",
        "meeting": "https://ratsinfo.leipzig.de/bi/oparl/1.0/meetings.asp?organization=2286",
        "membership": [
            "https://ratsinfo.leipzig.de/bi/oparl/1.0/memberships.asp?typ=mg&id=1414"
        ],
        "location": {
            "id": "https://ratsinfo.leipzig.de/bi/oparl/1.0/locations.asp?id=32286",
            "type": "https://schema.oparl.org/1.0/Location",
            "description": "Friedrich-Ebert-Str. 19a, 04109 Leipzig",
            "street_address": "Friedrich-Ebert-Str. 19a",
            "postal_code": "04109",
            "subLocality": "",
            "locality": "Leipzig",
        },
        "created": "2000-01-01T12:00:00+01:00",
        "modified": "2018-04-10T12:14:31+02:00",
    }

    [location, organization] = list(externalize(sample_organization))
    self.assertEqual(organization.data["location"], location.data["id"])
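# Everything in this section relies on externalize(), which walks an OParl JSON
# object, pulls out embedded objects, and replaces them with their URLs. The
# following is a minimal sketch of that assumed contract, not the real
# implementation: the Externalized wrapper and its fields (url, oparl_type, data)
# are inferred from how the results are consumed in the functions below, and
# lists of embedded objects are omitted for brevity.
from dataclasses import dataclass
from typing import Any, Dict, Iterator, Optional, Set


@dataclass(eq=False)  # hash by identity so instances can live in sets
class Externalized:
    url: str
    oparl_type: str
    data: Dict[str, Any]


def externalize_sketch(
    element: Dict[str, Any], keys_of_interest: Optional[Set[str]] = None
) -> Iterator[Externalized]:
    for key, value in list(element.items()):
        # Embedded objects are dicts carrying their own OParl id
        if isinstance(value, dict):
            if "id" not in value:
                # Matches test_externalize_missing_id below: embedded objects
                # without an id are logged and dropped
                del element[key]
                continue
            if keys_of_interest is not None:
                keys_of_interest.add(key)
            # Recurse first, so embedded objects are yielded before their parent,
            # as [location, organization] in test_externalize expects
            yield from externalize_sketch(value, keys_of_interest)
            element[key] = value["id"]
    yield Externalized(element["id"], element["type"], element)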
def _process_element(self, element: JSON) -> List[str]:
    keys_of_interest: Set[str] = set()
    new = list(externalize(element, keys_of_interest))

    # Find the ids of removed embedded objects.
    # This way is not elegant, but it gets the job done.
    old_element = CachedObject.objects.filter(url=element["id"]).first()
    old_urls = set()
    if old_element:
        for key in keys_of_interest:
            if isinstance(old_element.data.get(key), list):
                old_urls.update(old_element.data[key])
            elif isinstance(old_element.data.get(key), str):
                old_urls.add(old_element.data[key])
    removed = old_urls - {i.url for i in new}
    fetch_later = CachedObject.objects.filter(url__in=removed).values_list(
        "url", flat=True
    )

    for instance in new:
        existing = CachedObject.objects.filter(url=instance.url).first()
        if existing:
            if existing.data == instance.data:
                continue
            existing.data = instance.data
            existing.to_import = True
            existing.save()
        else:
            instance.save()

    return list(fetch_later)
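# A hypothetical driver for _process_element during an incremental update. The
# surrounding loop and the method name are assumptions for illustration; only
# _process_element and import_anything (defined below) come from this section.
# The point: URLs returned as "fetch_later" belong to objects that are no longer
# embedded anywhere and must be re-fetched, or they would vanish from the database.
def _update_elements_sketch(self, elements: List[JSON]) -> None:
    fetch_later: List[str] = []
    for element in elements:
        fetch_later.extend(self._process_element(element))
    for url in fetch_later:
        # Re-import formerly embedded objects by their URL
        self.import_anything(url)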
def test_externalize_missing_id(caplog):
    """In http://buergerinfo.ulm.de/oparl/bodies/0001/meetings/11445,
    the embedded location does not have an id"""
    json_in = {
        "id": "http://buergerinfo.ulm.de/oparl/bodies/0001/meetings/11445",
        "type": "https://schema.oparl.org/1.1/Meeting",
        "name": "Klausurtagung des Gemeinderats",
        "start": "2021-06-12T09:00:00+02:00",
        "end": "2021-06-12T00:00:00+02:00",
        "location": {"description": "Ulm-Messe,"},
        "organization": [
            "http://buergerinfo.ulm.de/oparl/bodies/0001/organizations/gr/1"
        ],
        "created": "2020-11-11T09:47:04+01:00",
        "modified": "2020-11-11T09:48:13+01:00",
    }

    # Check that the location has been removed but everything else remained the same
    json_out = json_in.copy()
    del json_out["location"]

    externalized = externalize(json_in)
    assert len(externalized) == 1
    assert externalized[0].data == json_out
    assert caplog.messages == [
        "Embedded object at location in "
        "http://buergerinfo.ulm.de/oparl/bodies/0001/meetings/11445 does not have an "
        "id, skipping: {'description': 'Ulm-Messe,'}"
    ]
def import_anything(self, oparl_id: str) -> DefaultFields:
    """Hacky metaprogramming to import any object based on its id"""
    logging.info("Importing single object {}".format(oparl_id))
    to_return = None

    loaded = self.loader.load(oparl_id)
    # When a resource has moved, the user-specified id might differ from the
    # object's id. The loader prints a warning in that case.
    oparl_id = loaded["id"]

    externalized = list(externalize(loaded))
    # To avoid endless recursion, we sort the objects so that if A links to B,
    # then B gets imported first
    externalized.sort(
        key=lambda key: import_order.index(
            getattr(models, key.data["type"].split("/")[-1])
        )
    )

    for entry in externalized:
        instance = self.import_any_externalized(entry.data)
        defaults = {
            "url": entry.url,
            "data": entry.data,
            "oparl_type": entry.oparl_type,
            "to_import": False,
        }
        CachedObject.objects.update_or_create(url=entry.url, defaults=defaults)
        if entry.url == oparl_id:
            to_return = instance

    assert to_return, "Missing object for {}".format(oparl_id)
    return to_return
def fetch_list_initial(self, url: str) -> None:
    """Saves a complete external list as flattened json to the database"""
    logger.info(f"Fetching List {url}")

    timestamp = timezone.now()
    next_url = url
    all_objects = set()
    while next_url:
        logger.info(f"Fetching {next_url}")

        response = self.loader.load(next_url)

        objects = set()
        for element in response["data"]:
            externalized = externalize(element)
            for i in externalized:
                # Only collect objects that are new and not marked as deleted
                if not i.data.get("deleted") and i not in all_objects:
                    objects.add(i)

        next_url = response["links"].get("next")

        # We can't have that block outside the loop due to mysql's
        # max_allowed_packet, manifesting as "MySQL server has gone away"
        # https://stackoverflow.com/a/36637118/3549270
        # We'll be able to solve this a lot better after the django 2.2 update
        # with ignore_conflicts
        try:
            # Also avoid "MySQL server has gone away" errors due to timeouts
            # https://stackoverflow.com/a/32720475/3549270
            db.close_old_connections()
            # The tests are run with sqlite, which failed here with a
            # TransactionManagementError:
            # "An error occurred in the current transaction.
            # You can't execute queries until the end of the 'atomic' block."
            # That's why we build our own atomic block
            if settings.TESTING:
                with transaction.atomic():
                    saved_objects = CachedObject.objects.bulk_create(objects)
            else:
                saved_objects = CachedObject.objects.bulk_create(objects)
        except IntegrityError:
            saved_objects = set()
            for i in objects:
                defaults = {
                    "data": i.data,
                    "to_import": True,
                    "oparl_type": i.oparl_type,
                }
                saved_objects.add(
                    CachedObject.objects.update_or_create(
                        url=i.url, defaults=defaults
                    )[0]
                )

        all_objects.update(saved_objects)

    logger.info(f"Found {len(all_objects)} objects in {url}")
    ExternalList(url=url, last_update=timestamp).save()
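# fetch_list_initial stores a timestamp in ExternalList so that later runs can
# fetch only changes. This is a hedged sketch of that presumed follow-up step:
# fetch_list_update_sketch and the naive query-string handling are assumptions,
# though OParl external lists do accept a modified_since filter parameter.
def fetch_list_update_sketch(self, url: str) -> None:
    external_list = ExternalList.objects.get(url=url)
    timestamp = timezone.now()
    # Ask the server only for objects modified since the last run
    # (assumes url has no existing query string)
    next_url = f"{url}?modified_since={external_list.last_update.isoformat()}"
    while next_url:
        response = self.loader.load(next_url)
        for element in response["data"]:
            self._process_element(element)
        next_url = response["links"].get("next")
    external_list.last_update = timestamp
    external_list.save()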
def import_anything(
    self, oparl_id: str, object_type: Optional[Type[T]] = None
) -> DefaultFields:
    """Hacky metaprogramming to import any object based on its id"""
    logging.info("Importing single object {}".format(oparl_id))
    to_return = None

    try:
        loaded = self.loader.load(oparl_id)
    except HTTPError as e:
        logger.error(f"Failed to load {oparl_id}: {e}")
        # This is a horrible workaround for broken oparl implementations
        # See test_missing.py
        if object_type and issubclass(object_type, DummyInterface):
            logger.error(f"Using a dummy for {oparl_id}. THIS IS BAD.")
            # noinspection PyTypeChecker
            dummy: T = object_type.dummy(oparl_id)
            dummy.save()
            return dummy
        else:
            raise

    # When a resource has moved, the user-specified id might differ from the
    # object's id. The loader prints a warning in that case.
    oparl_id = loaded["id"]

    externalized = list(externalize(loaded))
    # To avoid endless recursion, we sort the objects so that if A links to B,
    # then B gets imported first
    externalized.sort(
        key=lambda key: import_order.index(
            getattr(models, key.data["type"].split("/")[-1])
        )
    )

    for entry in externalized:
        instance = self.import_any_externalized(entry.data)
        defaults = {
            "url": entry.url,
            "data": entry.data,
            "oparl_type": entry.oparl_type,
            "to_import": False,
        }
        CachedObject.objects.update_or_create(url=entry.url, defaults=defaults)
        if entry.url == oparl_id:
            to_return = instance

    assert to_return, "Missing object for {}".format(oparl_id)
    return to_return
def setUpClass(cls):
    super().setUpClass()
    cls.api_data = {}
    cls.loader = MockLoader()
    cls.loader.api_data = cls.api_data
    for file in os.listdir(cls.dummy_data):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(cls.dummy_data, file)) as fp:
            data = json.load(fp)
        cls.api_data[data["id"]] = data
        for entry in externalize(data):
            if entry.data["id"] not in cls.api_data:
                cls.api_data[entry.data["id"]] = entry.data

    # Used by test_location_default_body
    body = Body()
    body.short_name = "München"
    cls.converter = JsonToDb(cls.loader, default_body=body)
    cls.converter.warn_missing = False
    cls.utils = Utils()
def import_anything(
    self, oparl_id: str, object_type: Optional[Type[T]] = None
) -> DefaultFields:
    """Hacky metaprogramming to import any object based on its id"""
    logging.info(f"Importing single object {oparl_id}")

    try:
        loaded = self.loader.load(oparl_id)
    except HTTPError as e:
        logger.error(
            f"Failed to load {oparl_id}. Using a dummy instead. THIS IS BAD: {e}"
        )
        return self._make_dummy(oparl_id, object_type)

    if not isinstance(loaded, dict):
        logger.error(
            f"JSON loaded from {oparl_id} is not a dict/object. "
            f"Using a dummy instead. THIS IS BAD"
        )
        return self._make_dummy(oparl_id, object_type)

    if "type" not in loaded:
        if object_type:
            loaded["type"] = "https://schema.oparl.org/1.0/" + object_type.__name__
            logger.warning(
                f"Object loaded from {oparl_id} has no type field, "
                f"inferred to {loaded['type']}"
            )
        else:
            raise RuntimeError(
                f"The object {oparl_id} has no type field and object_type wasn't given"
            )

    if "id" not in loaded:
        logger.warning(
            f"Object loaded from {oparl_id} has no id field, setting id to url"
        )
        loaded["id"] = oparl_id
    oparl_id = loaded["id"]

    externalized = list(externalize(loaded))
    # To avoid endless recursion, we sort the objects so that if A links to B,
    # then B gets imported first
    externalized.sort(
        key=lambda key: import_order.index(
            getattr(models, key.data["type"].split("/")[-1])
        )
    )

    to_return = None
    for entry in externalized:
        instance = self.import_any_externalized(entry.data)
        defaults = {
            "url": entry.url,
            "data": entry.data,
            "oparl_type": entry.oparl_type,
            "to_import": False,
        }
        CachedObject.objects.update_or_create(url=entry.url, defaults=defaults)
        if entry.url == oparl_id:
            to_return = instance

    assert to_return, f"Missing object for {oparl_id}"
    return to_return
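# Illustrative usage of import_anything. The importer construction is an
# assumption (setUpClass above passes extra arguments), and models.Paper stands
# in for any importable OParl type.
importer = JsonToDb(loader)  # a loader as in setUpClass above
paper = importer.import_anything(
    "https://oparl.example.org/paper/1", object_type=models.Paper
)
# If the server errors out or returns malformed JSON, a dummy object is created
# instead of aborting, so a single broken link cannot stop a whole import run.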