def test_elasticsearch_release_kbart_year():
    """KBART coverage that reaches last year counts as preserved ("dark");
    coverage that stops two years back does not count at all."""
    current_year = datetime.date.today().year
    release = ReleaseEntity(
        title="something",
        release_year=current_year,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    release.state = "active"

    # Span ends two years ago: too stale to count as KBART coverage
    release.container = ContainerEntity(
        name="dummy journal",
        extra={
            "kbart": {
                "lockss": {
                    "year_spans": [[1900, current_year - 2]],
                },
            },
        },
    )
    doc = release_to_elasticsearch(release)
    assert doc["release_year"] == current_year
    assert doc["preservation"] == "none"
    assert doc["is_oa"] is True
    assert doc["is_longtail_oa"] is False
    assert doc["is_preserved"] is False
    assert doc["in_web"] is False
    assert doc["in_dweb"] is False
    assert doc["in_ia"] is False
    assert doc["in_ia_sim"] is False
    assert doc["in_kbart"] is False
    assert doc["in_jstor"] is False

    # Span reaching last year: release is considered in-KBART / dark-preserved
    release.container = ContainerEntity(
        name="dummy journal",
        extra={
            "kbart": {
                "lockss": {
                    "year_spans": [[1900, current_year - 1]],
                },
            },
        },
    )
    doc = release_to_elasticsearch(release)
    assert doc["release_year"] == current_year
    assert doc["preservation"] == "dark"
    assert doc["is_oa"] is True
    assert doc["is_longtail_oa"] is False
    assert doc["is_preserved"] is True
    assert doc["in_web"] is False
    assert doc["in_dweb"] is False
    assert doc["in_ia"] is False
    assert doc["in_ia_sim"] is False
    assert doc["in_kbart"] is True
    assert doc["in_jstor"] is False
def update_entity(self, ce: ContainerEntity) -> None:
    """
    Mutates a container entity in place, updating fields with values from
    this form.

    Form must be validated *before* calling this function.
    """
    # Copy simple top-level fields, normalizing blank strings to None
    for field in CONTAINER_SIMPLE_ATTRS:
        value = getattr(self, field).data
        setattr(ce, field, None if value == "" else value)

    if not ce.extra:
        ce.extra = {}

    # Non-empty extra fields are stored in the entity's 'extra' dict
    for field in CONTAINER_EXTRA_ATTRS:
        value = getattr(self, field).data
        if value and value != "":
            ce.extra[field] = value

    url_list = [entry.data for entry in self.urls]
    if url_list:
        ce.extra["urls"] = url_list

    if self.edit_description.data:
        ce.edit_extra = dict(description=self.edit_description.data)

    # An extra dict that ended up empty is stored as None
    if not ce.extra:
        ce.extra = None
def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
    """
    row is a python dict (parsed from JSON).

    returns a ContainerEntity (or None if invalid or couldn't parse)

    Fixes over the previous version:
    - tolerate rows with no "extra" key (row["extra"].get(...) raised
      KeyError before; everything else in this function uses .get())
    - no longer mutates the caller's row["extra"] dict when uppercasing
      ISSNs; the normalized values are kept in locals instead
    """
    name = clean_str(row.get("name"))
    # strip a single trailing period from the name
    if name and name.endswith("."):
        name = name[:-1]
    if not name:
        # Name is required (by schema)
        return None

    row_extra = row.get("extra") or {}

    extra = dict()
    for k in (
        "urls",
        "webarchive_urls",
        "country",
        "sherpa_romeo",
        "ezb",
        "szczepanski",
        "doaj",
        "languages",
        "ia",
        "scielo",
        "kbart",
        "publisher_type",
        "platform",
    ):
        if row_extra.get(k):
            extra[k] = row_extra[k]

    # crude container_type detection from the name; the trailing space in
    # "journal " is preserved from the original matching behavior
    lower_name = name.lower()
    container_type = None
    if "proceedings" in lower_name:
        container_type = "proceedings"
    elif "journal " in lower_name:
        container_type = "journal"

    # normalize ISSNs to upper-case (e.g. '1234-567x' -> '1234-567X')
    issnp = row_extra.get("issnp")
    if issnp:
        issnp = issnp.upper()
    issne = row_extra.get("issne")
    if issne:
        issne = issne.upper()

    ce = ContainerEntity(
        issnl=row["issnl"],
        issnp=issnp,
        issne=issne,
        ident=row["ident"],
        name=name,
        container_type=container_type,
        publisher=clean_str(row.get("publisher")),
        wikidata_qid=row.get("wikidata_qid"),
        extra=extra,
    )
    return ce
def generic_deleted_entity(entity_type: str, ident: str) -> Any:
    """Construct a minimal entity of the given type, marked as deleted.

    Raises NotImplementedError for unrecognized entity types.
    """
    # releases need a (blank) ext_ids object; all other types take no args
    factories = {
        "container": ContainerEntity,
        "creator": CreatorEntity,
        "file": FileEntity,
        "fileset": FilesetEntity,
        "webcapture": WebcaptureEntity,
        "release": lambda: ReleaseEntity(ext_ids=ReleaseExtIds()),
        "work": WorkEntity,
    }
    if entity_type not in factories:
        raise NotImplementedError
    entity: Any = factories[entity_type]()
    entity.ident = ident
    entity.state = "deleted"
    return entity
def to_entity(self) -> ContainerEntity:
    """Build a fresh ContainerEntity populated from this form's fields."""
    assert self.name.data
    ce = ContainerEntity(name=self.name.data)
    # all remaining fields are copied over by the shared update helper
    self.update_entity(ce)
    return ce
def test_rich_elasticsearch_convert():
    """Transform a fully-populated release (container KBART/JSTOR/SIM
    metadata plus an attached file) and check the derived ES fields."""
    release = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    release.state = "active"
    release.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {"color": "blue"},
            "doaj": {"as_of": "2010-02-03"},
        },
    )
    release.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=[
                FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
                FileUrl(
                    rel="webarchive",
                    url=
                    "https://web.archive.org/web/20001122030405/http://example.com",
                ),
                FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
            ],
            extra={
                "shadows": {},
            },
        )
    ]

    doc = release_to_elasticsearch(release)
    assert doc["release_year"] == release.release_year
    assert doc["file_count"] == 1
    assert doc["fileset_count"] == 0
    assert doc["webcapture_count"] == 0
    assert doc["ref_count"] == 2
    assert doc["ref_linked_count"] == 1
    assert doc["preservation"] == "bright"
    assert doc["is_oa"] is True
    assert doc["is_longtail_oa"] is False
    assert doc["is_preserved"] is True
    assert doc["in_web"] is True
    assert doc["in_dweb"] is True
    assert doc["in_ia"] is True
    assert doc["in_ia_sim"] is False
    assert doc["in_kbart"] is True
    assert doc["in_jstor"] is True
def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
    """
    row is a python dict (parsed from JSON).

    returns a ContainerEntity (or None if invalid or couldn't parse)
    """
    if not row.get("name"):
        # Name is required (by schema)
        return None

    # TODO: not including for now: norwegian, dois/crossref, ia
    passthrough_keys = (
        "issne",
        "issnp",
        "languages",
        "country",
        "urls",
        "abbrev",
        "coden",
        "aliases",
        "original_name",
        "first_year",
        "last_year",
        "platform",
        "default_license",
        "road",
        "mimetypes",
        "sherpa_romeo",
        "kbart",
    )
    extra = {k: row[k] for k in passthrough_keys if row.get(k)}

    doaj_meta = dict()
    if row.get("doaj"):
        if row["doaj"].get("as_of"):
            doaj_meta["as_of"] = row["doaj"]["as_of"]
        if row["doaj"].get("works"):
            doaj_meta["works"] = row["doaj"]["works"]
    if doaj_meta:
        extra["doaj"] = doaj_meta

    # TODO: would like an ia.longtail_ia flag
    ia_meta = dict()
    if row.get("sim"):
        # NB: the None case of the .get() here is blech, but otherwise
        # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim'
        # later on
        ia_meta["sim"] = {
            "year_spans": row["sim"].get("year_spans"),
        }
    if ia_meta:
        extra["ia"] = ia_meta

    name = clean_str(row.get("name"))
    if not name:
        return None

    return ContainerEntity(
        issnl=row["issnl"],
        issne=row.get("issne"),
        issnp=row.get("issnp"),
        container_type=None,  # TODO
        name=name,
        publisher=clean_str(row.get("publisher")),
        wikidata_qid=None,  # TODO
        extra=extra,
    )
def enrich_container_entity(entity: ContainerEntity) -> ContainerEntity:
    """Attach a cached elasticsearch transform (``_es``) to an active
    container entity; redirect/deleted entities pass through untouched.

    The entity is always returned, whatever its state, so the declared
    return type holds and callers can chain on the result.
    """
    if entity.state in ("redirect", "deleted"):
        return entity
    if entity.state == "active":
        entity._es = container_to_elasticsearch(entity, force_bool=False)
    # NOTE(review): unconditional return so that states other than
    # active/redirect/deleted (e.g. "wip") don't implicitly return None,
    # which would contradict the -> ContainerEntity annotation.
    return entity
def test_choose_primary_container(api) -> None:
    """Exercise ContainerMerger.choose_primary_container's priority order:
    completeness beats stubs, redirects beat none, and higher release
    counts win among otherwise-equal candidates."""
    release_counts = {}
    redirects = {}
    merger = ContainerMerger(api=api)

    # bare stub: name only
    ce_stub = ContainerEntity(
        ident="pppppp5apzfhbbxxc7rgu2yw6m",
        name="dummy journal",
    )
    release_counts[ce_stub.ident] = 0
    redirects[ce_stub.ident] = []

    # complete metadata, zero releases, no redirects
    ce_partial = ContainerEntity(
        ident="eeeeeeeapzfhbbxxc7rgu2yw6m",
        name="dummy complete journal",
        publisher="some publisher",
        issnl="1234-5678",
        publication_status="active",
        extra=dict(asdf=123, ia=dict(asdf=True)),
    )
    release_counts[ce_partial.ident] = 0
    redirects[ce_partial.ident] = []

    # same metadata, but has an existing redirect pointing at it
    ce_partial_redirects = ContainerEntity(
        ident="rrrrrrrrrrfhbbxxc7rgu2yw6m",
        name="dummy complete journal",
        publisher="some publisher",
        issnl="1234-5678",
        publication_status="active",
        extra=dict(asdf=123, ia=dict(asdf=True)),
    )
    release_counts[ce_partial_redirects.ident] = 0
    redirects[ce_partial_redirects.ident] = [
        "zzzzzzzzrrfhbbxxc7rgu2yw6m",
    ]

    # complete metadata, zero releases
    ce_complete_zero = ContainerEntity(
        ident="oooooooapzfhbbxxc7rgu2yw6m",
        name="dummy complete journal",
        publisher="some publisher",
        issnl="1234-5678",
        publication_status="active",
        extra=dict(asdf=123, ia=dict(asdf=True)),
    )
    release_counts[ce_complete_zero.ident] = 0
    redirects[ce_complete_zero.ident] = []

    # complete metadata, a few releases
    ce_complete_small = ContainerEntity(
        ident="cccccccapzfhbbxxc7rgu2yw6m",
        name="dummy complete journal",
        publisher="some publisher",
        issnl="1234-5678",
        publication_status="active",
        extra=dict(asdf=123, ia=dict(asdf=True)),
    )
    release_counts[ce_complete_small.ident] = 10
    redirects[ce_complete_small.ident] = []

    # complete metadata, many releases
    ce_complete_big = ContainerEntity(
        ident="ddddddddpzfhbbxxc7rgu2yw6m",
        name="dummy complete journal",
        publisher="some publisher",
        issnl="1234-5678",
        publication_status="active",
        extra=dict(asdf=123, ia=dict(asdf=True)),
    )
    release_counts[ce_complete_big.ident] = 9999999
    redirects[ce_complete_big.ident] = []

    assert (
        merger.choose_primary_container([ce_stub, ce_partial], redirects, release_counts)
        == ce_partial.ident
    )
    assert (
        merger.choose_primary_container(
            [ce_stub, ce_complete_zero, ce_partial], redirects, release_counts
        )
        == ce_complete_zero.ident
    )
    assert (
        merger.choose_primary_container(
            [ce_stub, ce_partial_redirects, ce_complete_zero, ce_partial],
            redirects,
            release_counts,
        )
        == ce_partial_redirects.ident
    )
    assert (
        merger.choose_primary_container(
            [ce_stub, ce_complete_zero, ce_complete_small, ce_partial],
            redirects,
            release_counts,
        )
        == ce_complete_small.ident
    )
    assert (
        merger.choose_primary_container(
            [
                ce_stub,
                ce_complete_big,
                ce_complete_zero,
                ce_complete_small,
                ce_partial,
            ],
            redirects,
            release_counts,
        )
        == ce_complete_big.ident
    )
    assert (
        merger.choose_primary_container(
            [ce_complete_small, ce_complete_big], redirects, release_counts
        )
        == ce_complete_big.ident
    )