def run_releases(self, release_stream: Sequence[str]) -> None:
    """
    Iterates over the stream of releases, which are expected to be grouped
    (sorted) by work_ident.

    Collects releases under the same work_ident into a batch and processes a
    work from that.

    TODO: what is the right API here? stream iterator? how should
    parallelism work?
    """
    batch = []
    batch_work_id = None
    for line in release_stream:
        if not line:
            continue
        release = entity_from_json(line, ReleaseEntity)
        if release.work_id == batch_work_id:
            batch.append(release)
            continue
        # work_id changed: emit the accumulated batch, then start a new one
        if batch:
            ib = self.process_release_list(batch)
            print(ib.json(exclude_none=True, sort_keys=True))
            batch_work_id = None
        batch = [release]
        batch_work_id = release.work_id

    # flush the final batch
    if batch:
        ib = self.process_release_list(batch)
        print(ib.json(exclude_none=True, sort_keys=True))
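# Usage sketch (an assumption, not part of the original code): although the
# annotation says Sequence[str], the body only iterates, so any iterable of
# newline-delimited release JSON strings works; a file handle can be passed
# directly. The `pipeline` parameter stands in for whatever object defines
# run_releases(); its construction is not shown in this snippet.
def run_releases_from_file(pipeline, path: str) -> None:
    # File objects iterate line-by-line, matching what run_releases() expects.
    with open(path, "r") as f:
        pipeline.run_releases(f)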
def test_es_biblio_from_release() -> None:
    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
        release = entity_from_json(f.read(), ReleaseEntity)

    obj = es_biblio_from_release(release)
    d = json.loads(obj.json())
    assert (
        obj.release_ident
        == release.ident
        == d["release_ident"]
        == "hsmo6p4smrganpb3fndaj2lon4"
    )
def test_es_release_from_release() -> None:
    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
        release = entity_from_json(f.read(), ReleaseEntity)

    obj = es_release_from_release(release)
    d = json.loads(obj.json())
    assert obj.ident == release.ident == d["ident"] == "hsmo6p4smrganpb3fndaj2lon4"
    assert obj.doi_registrar == "crossref"
    assert obj.doi_prefix == "10.7717"
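# The two tests above assume imports along these lines. The fatcat_scholar
# module paths are a best guess from the function names and typical project
# layout, not something this snippet confirms.
import json

from fatcat_openapi_client import ReleaseEntity

from fatcat_scholar.api_entities import entity_from_json
from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release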
def from_json(obj: Dict[Any, Any]) -> "IntermediateBundle":
    return IntermediateBundle(
        doc_type=DocType(obj.get("doc_type")),
        releases=[
            entity_from_json(json.dumps(re), ReleaseEntity)
            for re in obj.get("releases", [])
        ],
        biblio_release_ident=obj.get("biblio_release_ident"),
        crossref=obj.get("crossref"),
        grobid_fulltext=obj.get("grobid_fulltext"),
        pdftotext_fulltext=obj.get("pdftotext_fulltext"),
        pdf_meta=obj.get("pdf_meta"),
        sim_fulltext=obj.get("sim_fulltext"),
        html_fulltext=obj.get("html_fulltext"),
    )
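# Round-trip sketch (illustrative, not from the original source): the pipeline
# code above prints bundles with ib.json(exclude_none=True, sort_keys=True),
# and from_json() can re-parse that output. This assumes IntermediateBundle
# exposes a pydantic-style .json() method, as those call sites suggest, and
# that from_json() is callable on the class (e.g. as a @staticmethod).
import json


def intermediate_bundle_round_trip(ib: "IntermediateBundle") -> "IntermediateBundle":
    # Serialize with the same options the pipeline uses, then reconstruct;
    # ReleaseEntity values survive because from_json() re-parses each release
    # dict via entity_from_json().
    return IntermediateBundle.from_json(
        json.loads(ib.json(exclude_none=True, sort_keys=True))
    )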
def process_batch(self, batch: List[dict]) -> None:
    bulk_actions = []
    for obj in batch:
        bundle = IntermediateBundle(
            doc_type=DocType(obj["doc_type"]),
            releases=[
                entity_from_json(json.dumps(re), ReleaseEntity)
                for re in obj["releases"]
            ],
            biblio_release_ident=obj.get("biblio_release_ident"),
            grobid_fulltext=obj.get("grobid_fulltext"),
            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
            pdf_meta=obj.get("pdf_meta"),
            html_fulltext=obj.get("html_fulltext"),
            sim_fulltext=obj.get("sim_fulltext"),
        )
        es_doc = transform_heavy(bundle)
        if not es_doc:
            continue
        bulk_actions.append(
            {
                "_index": self.es_index,
                "_op_type": "index",
                "_id": es_doc.key,
                "_source": es_doc.json(exclude_none=True, sort_keys=True),
            }
        )
        self.counts["docs-indexed"] += 1

    if not bulk_actions:
        return
    elasticsearch.helpers.bulk(self.es_client, bulk_actions, timeout="30s")
    self.counts["batches-indexed"] += 1
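# Hedged driver sketch for process_batch() (not part of the original code):
# read newline-delimited JSON documents from stdin and index them in
# fixed-size batches. Only process_batch() itself comes from the worker
# above; the `worker` parameter and the default batch size are illustrative.
import json
import sys


def index_from_stdin(worker, batch_size: int = 50) -> None:
    batch = []
    for line in sys.stdin:
        if not line.strip():
            continue
        batch.append(json.loads(line))
        if len(batch) >= batch_size:
            worker.process_batch(batch)
            batch = []
    # flush any remaining partial batch
    if batch:
        worker.process_batch(batch)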