class PersistGrobidRefsWorker(SandcrawlerWorker):
    """
    Batch-only persist worker that backfills GROBID references into a local
    postgresql database.

    The records it consumes are the JSON output of the GROBID
    CrossrefRefsWorker.
    """

    def __init__(self, db_url: str, **kwargs):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Unsupported: records are only persisted in batches (as transactions)."""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        """
        Persist one batch of GROBID refs records and commit.

        Every record must carry truthy "source" and "source_id" values; an
        AssertionError is raised (before anything is written) otherwise.
        The first two elements of the insert result are added to the
        "insert-grobid_refs" and "update-grobid_refs" counters respectively.

        Always returns an empty list.
        """
        self.counts["total"] += len(batch)

        rows = []
        for item in batch:
            for required_key in ("source", "source_id"):
                assert item[required_key]
            rows.append(item)

        result = self.db.insert_grobid_refs(self.cur, rows)

        # Account for any records that did not make it into the insert.
        shortfall = len(batch) - len(rows)
        if shortfall > 0:
            self.counts["skip"] += shortfall

        inserted, updated = result[0], result[1]
        self.counts["insert-grobid_refs"] += inserted
        self.counts["update-grobid_refs"] += updated

        self.db.commit()
        return []
class PersistCrossrefWorker(SandcrawlerWorker):
    """
    Pushes Crossref API JSON records into postgresql.

    Can also talk to GROBID to parse 'unstructured' references, and push the
    parsed results into postgresql at the same time (both inserts are
    committed together at the end of each batch).
    """

    def __init__(
        self,
        db_url: str,
        grobid_client: Optional[GrobidClient],
        parse_refs: bool = True,
        **kwargs
    ):
        """
        Args:
            db_url: connection string handed to SandcrawlerPostgresClient.
            grobid_client: client used for reference parsing; when falsy a
                default-constructed GrobidClient is used instead.
            parse_refs: when True, each record is also sent to GROBID and
                the parsed references are persisted alongside it.
            **kwargs: forwarded to the SandcrawlerWorker constructor.
        """
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()
        self.grobid_client = grobid_client if grobid_client else GrobidClient()
        self.parse_refs = parse_refs

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        """
        Persist one batch of Crossref records (and, optionally, their parsed
        references) and commit.

        Each record must provide "DOI" and "indexed" -> "date-time"; a
        missing key raises KeyError before anything is written.

        When reference parsing is enabled, records whose GROBID call fails
        with a parse error, HTTP error or read timeout are still stored as
        Crossref rows; only their references are dropped, and each such
        failure is added to the "skip" counter.

        Always returns an empty list.
        """
        self.counts["total"] += len(batch)

        crossref_batch = []
        refs_batch = []
        refs_failed = 0
        for record in batch:
            crossref_batch.append(
                dict(
                    doi=record["DOI"].lower().strip(),
                    indexed=record["indexed"]["date-time"],
                    record=record,
                )
            )
            if not self.parse_refs:
                continue
            try:
                parsed_refs = self.grobid_client.crossref_refs(record)
            except (
                xml.etree.ElementTree.ParseError,
                requests.exceptions.HTTPError,
                requests.exceptions.ReadTimeout,
            ):
                print("GROBID crossref refs parsing error, skipping with a sleep")
                # Pause before the next record so that consecutive failures
                # do not turn into back-to-back GROBID requests.
                time.sleep(3)
                refs_failed += 1
            else:
                refs_batch.append(parsed_refs)

        # Every record yields exactly one crossref row (a malformed record
        # raises instead), so nothing is ever skipped on the crossref side.
        resp = self.db.insert_crossref(self.cur, crossref_batch)
        self.counts["insert-crossref"] += resp[0]
        self.counts["update-crossref"] += resp[1]

        # Count failed reference parses even when *all* of them failed and
        # there is consequently nothing to insert below.
        if refs_failed:
            self.counts["skip"] += refs_failed

        if refs_batch:
            resp = self.db.insert_grobid_refs(self.cur, refs_batch)
            self.counts["insert-grobid_refs"] += resp[0]
            self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []