Example #1
    def run_releases(self, release_stream: Sequence[str]) -> None:
        """
        Iterates over the stream of releases, which are expected to be grouped
        (sorted) by work_ident.

        Collects releases with the same work_ident into a batch and processes
        each batch as a single work.

        TODO: what is the right API here? stream iterator? how should
        parallelism work?
        """
        batch = []
        batch_work_id = None
        for line in release_stream:
            if not line:
                continue
            release = entity_from_json(line, ReleaseEntity)
            if release.work_id == batch_work_id:
                batch.append(release)
                continue
            if batch:
                ib = self.process_release_list(batch)
                print(ib.json(exclude_none=True, sort_keys=True))
                batch_work_id = None
            batch = [
                release,
            ]
            batch_work_id = release.work_id

        if batch:
            ib = self.process_release_list(batch)
            print(ib.json(exclude_none=True, sort_keys=True))
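
A minimal driver sketch for run_releases(); the WorkPipeline class name, its
constructor arguments, and the file path are assumptions for illustration, and
only the requirement that input lines be sorted by work_ident comes from the
docstring above.

# Hypothetical driver (class name, constructor, and path are placeholders):
# feeds run_releases() one JSON line per release, pre-sorted by work_ident so
# that all releases of a work arrive contiguously.
def main() -> None:
    worker = WorkPipeline()  # real constructor arguments omitted
    with open("releases.sorted_by_work.json", "r") as f:
        worker.run_releases([line.strip() for line in f])

if __name__ == "__main__":
    main()
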
Example #2
def test_es_biblio_from_release() -> None:

    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
        release = entity_from_json(f.read(), ReleaseEntity)

    obj = es_biblio_from_release(release)
    d = json.loads(obj.json())

    assert (obj.release_ident == release.ident == d["release_ident"] ==
            "hsmo6p4smrganpb3fndaj2lon4")
Example #3
def test_es_release_from_release() -> None:

    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
        release = entity_from_json(f.read(), ReleaseEntity)

    obj = es_release_from_release(release)
    d = json.loads(obj.json())

    assert (obj.ident == release.ident == d["ident"] ==
            "hsmo6p4smrganpb3fndaj2lon4")
    assert obj.doi_registrar == "crossref"
    assert obj.doi_prefix == "10.7717"
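
The DOI assertions follow the general DOI structure: the prefix is the
registrant code before the first slash, while the registrar (Crossref here) is
recorded separately in the release metadata. A small illustration of the
prefix rule with a made-up DOI; es_release_from_release may derive the value
differently internally.

# Illustration of the DOI prefix rule only (the example DOI is made up).
def doi_prefix(doi: str) -> str:
    return doi.split("/", 1)[0]

assert doi_prefix("10.7717/example.123") == "10.7717"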
Example #4
    @staticmethod
    def from_json(obj: Dict[Any, Any]) -> "IntermediateBundle":
        return IntermediateBundle(
            doc_type=DocType(obj.get("doc_type")),
            releases=[
                entity_from_json(json.dumps(re), ReleaseEntity)
                for re in obj.get("releases", [])
            ],
            biblio_release_ident=obj.get("biblio_release_ident"),
            crossref=obj.get("crossref"),
            grobid_fulltext=obj.get("grobid_fulltext"),
            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
            pdf_meta=obj.get("pdf_meta"),
            sim_fulltext=obj.get("sim_fulltext"),
            html_fulltext=obj.get("html_fulltext"),
        )
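
A short usage sketch for from_json(), reading bundles from a JSON-lines stream
on stdin; the stdin source and the printed fields are assumptions, only the
dict shape that from_json() reads is taken from the code above.

# Usage sketch (stdin source and printed fields are assumptions): parse each
# JSON line into a dict and rebuild an IntermediateBundle from it.
import json
import sys

for line in sys.stdin:
    if not line.strip():
        continue
    bundle = IntermediateBundle.from_json(json.loads(line))
    print(bundle.doc_type, len(bundle.releases))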
Example #5
    def process_batch(self, batch: List[dict]) -> None:

        bulk_actions = []
        for obj in batch:
            bundle = IntermediateBundle(
                doc_type=DocType(obj["doc_type"]),
                releases=[
                    entity_from_json(json.dumps(re), ReleaseEntity)
                    for re in obj["releases"]
                ],
                biblio_release_ident=obj.get("biblio_release_ident"),
                grobid_fulltext=obj.get("grobid_fulltext"),
                pdftotext_fulltext=obj.get("pdftotext_fulltext"),
                pdf_meta=obj.get("pdf_meta"),
                html_fulltext=obj.get("html_fulltext"),
                sim_fulltext=obj.get("sim_fulltext"),
            )
            es_doc = transform_heavy(bundle)
            if not es_doc:
                continue
            bulk_actions.append({
                "_index": self.es_index,
                "_op_type": "index",
                "_id": es_doc.key,
                "_source": es_doc.json(exclude_none=True, sort_keys=True),
            })
            self.counts["docs-indexed"] += 1

        if not bulk_actions:
            return

        elasticsearch.helpers.bulk(self.es_client, bulk_actions, timeout="30s")
        self.counts["batches-indexed"] += 1
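
A minimal sketch of driving process_batch() from a JSON-lines stream in
fixed-size chunks; the batch size, the stdin source, and the index_from_stdin
helper name are assumptions, only the List[dict] input shape comes from the
signature above.

# Hypothetical driver: chunks JSON lines from stdin into fixed-size batches
# for process_batch(). Batch size and input source are assumptions.
import json
import sys
from typing import List

def index_from_stdin(worker, batch_size: int = 50) -> None:
    batch: List[dict] = []
    for line in sys.stdin:
        if not line.strip():
            continue
        batch.append(json.loads(line))
        if len(batch) >= batch_size:
            worker.process_batch(batch)
            batch = []
    if batch:
        worker.process_batch(batch)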