import time
import xml.etree.ElementTree
from typing import Any, Dict, List, Optional

import psycopg2
import requests

# Local imports; module paths are assumed to match the rest of the codebase.
from sandcrawler.db import SandcrawlerPostgresClient
from sandcrawler.grobid import GrobidClient
from sandcrawler.ingest_html import HtmlMetaRow
from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.workers import SandcrawlerWorker


class PersistGrobidRefsWorker(SandcrawlerWorker):
    """
    Simple persist worker to backfill GROBID references into postgresql
    locally. Consumes the JSON output from GROBID CrossrefRefsWorker.
    """

    def __init__(self, db_url: str, **kwargs):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)

        refs_batch = []
        for record in batch:
            assert record["source"]
            assert record["source_id"]
            refs_batch.append(record)

        resp = self.db.insert_grobid_refs(self.cur, refs_batch)
        if len(refs_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(refs_batch)
        self.counts["insert-grobid_refs"] += resp[0]
        self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []

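# Minimal usage sketch for PersistGrobidRefsWorker. The db_url and the
# "refs_json" field are illustrative assumptions; push_batch() above only
# requires "source" and "source_id" on each record.
def _example_persist_grobid_refs(db_url: str = "postgres:///sandcrawler") -> None:
    worker = PersistGrobidRefsWorker(db_url=db_url)
    worker.push_batch(
        [
            {"source": "crossref", "source_id": "10.123/abc", "refs_json": []},
        ]
    )
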
class PersistCdxWorker(SandcrawlerWorker):
    def __init__(self, db_url: str, **kwargs):
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)
        # filter to full CDX lines, no liveweb
        cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])]
        resp = self.db.insert_cdx(self.cur, cdx_batch)
        if len(cdx_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(cdx_batch)
        self.counts["insert-cdx"] += resp[0]
        self.counts["update-cdx"] += resp[1]
        self.db.commit()
        return []

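# Sketch of the liveweb filter in PersistCdxWorker.push_batch() (the CDX
# fields shown are a made-up subset; insert_cdx() expects full rows):
def _example_persist_cdx(db_url: str = "postgres:///sandcrawler") -> None:
    worker = PersistCdxWorker(db_url=db_url)
    worker.push_batch(
        [
            # kept: warc_path contains a "/" (a full WARC path)
            {"warc_path": "wide/wide-20200101/example.warc.gz", "url": "https://example.com/a.pdf"},
            # skipped and counted under "skip": bare liveweb identifier
            {"warc_path": "liveweb-proxy", "url": "https://example.com/b.pdf"},
        ]
    )
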
class PersistPdfTrioWorker(SandcrawlerWorker):
    def __init__(self, db_url: str, **kwargs):
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)

        batch = [r for r in batch if "pdf_trio" in r and r["pdf_trio"].get("status_code")]
        for r in batch:
            # copy key (sha1hex) into sub-object
            r["pdf_trio"]["key"] = r["key"]
        pdftrio_batch = [r["pdf_trio"] for r in batch]
        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
        self.counts["insert-pdftrio"] += resp[0]
        self.counts["update-pdftrio"] += resp[1]

        file_meta_batch = [
            r["file_meta"]
            for r in batch
            if r["pdf_trio"]["status"] == "success" and r.get("file_meta")
        ]
        resp = self.db.insert_file_meta(self.cur, file_meta_batch)
        self.counts["insert-file-meta"] += resp[0]
        self.counts["update-file-meta"] += resp[1]

        self.db.commit()
        return []

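# Usage sketch for PersistPdfTrioWorker: the top-level "key" (file
# sha1hex) is copied into the "pdf_trio" sub-object by push_batch(). All
# field values here are illustrative assumptions, not a confirmed schema.
def _example_persist_pdftrio(db_url: str = "postgres:///sandcrawler") -> None:
    worker = PersistPdfTrioWorker(db_url=db_url)
    worker.push_batch(
        [
            {
                "key": "0000000000000000000000000000000000000000",
                "pdf_trio": {"status": "success", "status_code": 200, "ensemble_score": 0.95},
                "file_meta": {
                    "sha1hex": "0000000000000000000000000000000000000000",
                    "mimetype": "application/pdf",
                },
            }
        ]
    )
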
# Constructor for a persist worker that can write blobs to S3 (minio),
# rows to postgresql, or both; at most one of s3_only/db_only may be set.
def __init__(self, db_url: str, **kwargs):
    super().__init__()
    self.s3 = SandcrawlerMinioClient(
        host_url=kwargs.get("s3_url", "localhost:9000"),
        access_key=kwargs["s3_access_key"],
        secret_key=kwargs["s3_secret_key"],
        default_bucket=kwargs["s3_bucket"],
    )
    self.s3_only = kwargs.get("s3_only", False)
    self.db_only = kwargs.get("db_only", False)
    assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed"
    if not self.s3_only:
        self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url)
        self.cur: Optional[psycopg2.extensions.cursor] = self.db.conn.cursor()
    else:
        self.db = None
        self.cur = None

class PersistCrossrefWorker(SandcrawlerWorker):
    """
    Pushes Crossref API JSON records into postgresql. Can also talk to
    GROBID to parse 'unstructured' references, and push the parsed results
    into postgresql at the same time.
    """

    def __init__(
        self,
        db_url: str,
        grobid_client: Optional[GrobidClient],
        parse_refs: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()
        if grobid_client:
            self.grobid_client = grobid_client
        else:
            self.grobid_client = GrobidClient()
        self.parse_refs = parse_refs

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def push_batch(self, batch: list) -> list:
        self.counts["total"] += len(batch)

        crossref_batch = []
        refs_batch = []
        for record in batch:
            crossref_batch.append(
                dict(
                    doi=record["DOI"].lower().strip(),
                    indexed=record["indexed"]["date-time"],
                    record=record,
                )
            )
            if self.parse_refs:
                try:
                    parsed_refs = self.grobid_client.crossref_refs(record)
                    refs_batch.append(parsed_refs)
                except (
                    xml.etree.ElementTree.ParseError,
                    requests.exceptions.HTTPError,
                    requests.exceptions.ReadTimeout,
                ):
                    print("GROBID crossref refs parsing error, skipping with a sleep")
                    time.sleep(3)

        resp = self.db.insert_crossref(self.cur, crossref_batch)
        if len(crossref_batch) < len(batch):
            self.counts["skip"] += len(batch) - len(crossref_batch)
        self.counts["insert-crossref"] += resp[0]
        self.counts["update-crossref"] += resp[1]

        if refs_batch:
            resp = self.db.insert_grobid_refs(self.cur, refs_batch)
            if len(refs_batch) < len(batch):
                self.counts["skip"] += len(batch) - len(refs_batch)
            self.counts["insert-grobid_refs"] += resp[0]
            self.counts["update-grobid_refs"] += resp[1]

        self.db.commit()
        return []

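# Usage sketch for PersistCrossrefWorker. The db_url, GROBID host_url,
# and the minimal Crossref record are illustrative assumptions; a real
# Crossref API record carries many more fields.
def _example_persist_crossref(db_url: str = "postgres:///sandcrawler") -> None:
    worker = PersistCrossrefWorker(
        db_url=db_url,
        grobid_client=GrobidClient(host_url="http://localhost:8070"),
        parse_refs=True,
    )
    worker.push_batch(
        [
            {
                "DOI": "10.123/ABC",
                "indexed": {"date-time": "2021-01-01T00:00:00Z"},
                "title": ["An Example Record"],
            }
        ]
    )
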
class PersistIngestFileResultWorker(SandcrawlerWorker):
    def __init__(self, db_url: str, **kwargs):
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        self.cur = self.db.conn.cursor()

    def process(self, record: Any, key: Optional[str] = None) -> Any:
        """Only do batches (as transactions)"""
        raise NotImplementedError

    def request_to_row(self, raw: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Converts ingest-request JSON schema (eg, from Kafka) to SQL ingest_request schema

        if there is a problem with conversion, return None
        """
        # backwards compat hacks; transform request to look like current schema
        if raw.get("ingest_type") == "file":
            raw["ingest_type"] = "pdf"
        if (
            not raw.get("link_source")
            and raw.get("base_url")
            and raw.get("ext_ids", {}).get("doi")
            and raw["base_url"] == "https://doi.org/{}".format(raw["ext_ids"]["doi"])
        ):
            # set link_source(_id) for old ingest requests
            raw["link_source"] = "doi"
            raw["link_source_id"] = raw["ext_ids"]["doi"]
        if (
            not raw.get("link_source")
            and raw.get("ingest_request_source", "").startswith("savepapernow")
            and raw.get("fatcat", {}).get("release_ident")
        ):
            # set link_source(_id) for old ingest requests
            raw["link_source"] = "spn"
            raw["link_source_id"] = raw["fatcat"]["release_ident"]

        for k in ("ingest_type", "base_url", "link_source", "link_source_id"):
            if k not in raw:
                self.counts["skip-request-fields"] += 1
                return None
        if raw["ingest_type"] not in ("pdf", "xml", "html"):
            self.counts["skip-ingest-type"] += 1
            return None
        request = {
            "ingest_type": raw["ingest_type"],
            "base_url": raw["base_url"],
            "link_source": raw["link_source"],
            "link_source_id": raw["link_source_id"],
            "ingest_request_source": raw.get("ingest_request_source"),
            "request": {},
        }
        # extra/optional fields
        if raw.get("release_stage"):
            request["release_stage"] = raw["release_stage"]
        if raw.get("fatcat", {}).get("release_ident"):
            request["request"]["release_ident"] = raw["fatcat"]["release_ident"]
        for k in ("ext_ids", "edit_extra", "rel"):
            if raw.get(k):
                request["request"][k] = raw[k]
        # if this dict is empty, trim it to save DB space
        if not request["request"]:
            request["request"] = None
        return request

    def file_result_to_row(self, raw: dict) -> Optional[dict]:
        """
        Converts ingest-result JSON schema (eg, from Kafka) to SQL ingest_file_result schema

        if there is a problem with conversion, return None and set skip count
        """
        for k in ("request", "hit", "status"):
            if k not in raw:
                self.counts["skip-result-fields"] += 1
                return None
        if "base_url" not in raw["request"]:
            self.counts["skip-result-fields"] += 1
            return None
        ingest_type = raw["request"].get("ingest_type")
        if ingest_type == "file":
            ingest_type = "pdf"
        if ingest_type not in (
            "pdf",
            "xml",
            "html",
            "component",
            "src",
            "dataset",
            "dataset-file",
        ):
            self.counts["skip-ingest-type"] += 1
            return None
        if raw["status"] in ("existing",):
            self.counts["skip-existing"] += 1
            return None
        result = {
            "ingest_type": ingest_type,
            "base_url": raw["request"]["base_url"],
            "hit": raw["hit"],
            "status": raw["status"],
        }
        terminal = raw.get("terminal")
        if terminal:
            result["terminal_url"] = terminal.get("terminal_url") or terminal.get("url")
            result["terminal_dt"] = terminal.get("terminal_dt")
            result["terminal_status_code"] = (
                terminal.get("terminal_status_code")
                or terminal.get("status_code")
                or terminal.get("http_code")
            )
            if result["terminal_status_code"]:
                result["terminal_status_code"] = int(result["terminal_status_code"])
            result["terminal_sha1hex"] = terminal.get("terminal_sha1hex")
            # postgresql13 doesn't like extremely large URLs in b-tree index
            if result["terminal_url"] and len(result["terminal_url"]) > 2048:
                self.counts["skip-huge-url"] += 1
                return None
        return result

    def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
        html_body = record.get("html_body")
        file_meta = record.get("file_meta")
        if not (file_meta and html_body):
            return None
        return HtmlMetaRow(
            sha1hex=file_meta["sha1hex"],
            status=record.get("status"),
            scope=record.get("scope"),
            has_teixml=bool(html_body and html_body["status"] == "success"),
            has_thumbnail=False,  # TODO
            word_count=(html_body and html_body.get("word_count")) or None,
            biblio=record.get("html_biblio"),
            resources=record.get("html_resources"),
        )

    def result_to_platform_row(self, raw: dict) -> Optional[dict]:
        """
        Converts fileset ingest-result JSON schema (eg, from Kafka) to SQL
        ingest_fileset_platform schema

        if there is a problem with conversion, return None and set skip count
        """
        for k in ("request", "hit", "status"):
            if k not in raw:
                return None
        if "base_url" not in raw["request"]:
            return None
        ingest_type = raw["request"].get("ingest_type")
        if ingest_type not in ("dataset",):
            return None
        if raw["status"] in ("existing",):
            return None
        if not raw.get("platform_name"):
            return None
        result = {
            "ingest_type": ingest_type,
            "base_url": raw["request"]["base_url"],
            "hit": raw["hit"],
            "status": raw["status"],
            "platform_name": raw.get("platform_name"),
            "platform_domain": raw.get("platform_domain"),
            "platform_id": raw.get("platform_id"),
            "ingest_strategy": raw.get("ingest_strategy"),
            "total_size": raw.get("total_size"),
            "file_count": raw.get("file_count"),
            "archiveorg_item_name": raw.get("archiveorg_item_name"),
            "archiveorg_item_bundle_path": None,
            "web_bundle_url": None,
            "web_bundle_dt": None,
            "manifest": raw.get("manifest"),
        }
        # bundle metadata lives on the raw record, not on the row
        # constructed above (checking `result` here would always be False)
        if raw.get("fileset_bundle"):
            result["archiveorg_item_bundle_path"] = raw["fileset_bundle"].get(
                "archiveorg_item_bundle_path"
            )
            result["web_bundle_url"] = (
                raw["fileset_bundle"].get("terminal", {}).get("terminal_url")
            )
            result["web_bundle_dt"] = (
                raw["fileset_bundle"].get("terminal", {}).get("terminal_dt")
            )
        return result

    def push_batch(self, batch: List[Any]) -> List[Any]:
        self.counts["total"] += len(batch)

        if not batch:
            return []

        results_unfiltered = [self.file_result_to_row(raw) for raw in batch]
        results = [r for r in results_unfiltered if r]

        irequests_unfiltered = [
            self.request_to_row(raw["request"]) for raw in batch if raw.get("request")
        ]
        irequests = [
            r for r in irequests_unfiltered if r and r["ingest_type"] != "dataset-file"
        ]

        if irequests:
            resp = self.db.insert_ingest_request(self.cur, irequests)
            self.counts["insert-requests"] += resp[0]
            self.counts["update-requests"] += resp[1]
        if results:
            resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update")
            self.counts["insert-results"] += resp[0]
            self.counts["update-results"] += resp[1]

        # these schemas match, so can just pass through
        cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")]
        revisit_cdx_batch = [
            r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx")
        ]
        cdx_batch.extend(revisit_cdx_batch)
        # filter to full CDX lines, with full warc_paths (not liveweb)
        cdx_batch = [r for r in cdx_batch if r.get("warc_path") and ("/" in r["warc_path"])]
        if cdx_batch:
            resp = self.db.insert_cdx(self.cur, cdx_batch)
            self.counts["insert-cdx"] += resp[0]
            self.counts["update-cdx"] += resp[1]

        file_meta_batch = [
            r["file_meta"] for r in batch if r.get("hit") and r.get("file_meta")
        ]
        if file_meta_batch:
            resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
            self.counts["insert-file_meta"] += resp[0]
            self.counts["update-file_meta"] += resp[1]

        html_meta_batch = [
            self.result_to_html_meta(r) for r in batch if r.get("hit") and r.get("html_body")
        ]
        if html_meta_batch:
            rows = [d.to_sql_tuple() for d in html_meta_batch if d]
            resp = self.db.insert_html_meta(self.cur, rows, on_conflict="update")
            self.counts["insert-html_meta"] += resp[0]
            self.counts["update-html_meta"] += resp[1]

        fileset_platform_batch_all = [
            self.result_to_platform_row(raw)
            for raw in batch
            if raw.get("request", {}).get("ingest_type") == "dataset"
            and raw.get("platform_name")
        ]
        fileset_platform_batch: List[Dict] = [p for p in fileset_platform_batch_all if p]
        if fileset_platform_batch:
            resp = self.db.insert_ingest_fileset_platform(
                self.cur, fileset_platform_batch, on_conflict="update"
            )
            self.counts["insert-fileset_platform"] += resp[0]
            self.counts["update-fileset_platform"] += resp[1]

        self.db.commit()
        return []
