def backup_db(c: CoreIngestConfig) -> None:
    if c.skip_db_backup:
        announce("Skipping DB backup ...")
    else:
        announce("Backing up DB(s) ...")
        c.core_db_manager.backup_all_tables_for_all_dbs(
            ts=c.batch_timestamp, job_dir=c.db_backup_dir)

def update_es(c: CloneIngestConfig) -> None:
    announce(f"Creating/Updating ES index: {c.index_name} ...")
    c.es_publisher.create_index()
    c.es_publisher.index_jsons()
    if c.alias_name:
        announce(f"Setting ES index('{c.index_name}') to alias('{c.alias_name}') ...")
        c.es_publisher.update_alias()

def delete_from_db(c: CoreIngestConfig) -> None:
    if c.skip_db_update:
        announce("Skipping DB removal ...")
        return
    announce("Removing docs from DB ...")
    remove_docs_from_db(lm=c.load_manager, removal_list=c.db_tuple_list)

def update_thumbnails(c: CoreIngestConfig) -> None:
    """Uploads the .png thumbnails of the raw documents to the appropriate s3 prefix"""
    if c.skip_thumbnail_generation:
        announce("Skipping Thumbnails update [flag set] ...")
        return
    announce("Updating thumbnails ...")
    c.thumbnail_job_manager.process_directory()

def delete_from_neo4j(c: CoreIngestConfig) -> None:
    if c.skip_neo4j_update:
        announce("Skipping Neo4j removal ...")
        return
    announce("Removing docs from Neo4j ...")
    remove_docs_from_neo4j(njm=c.neo4j_job_manager, removal_list=c.removal_list)

def update_thumbnails(c: CloneIngestConfig) -> None:
    if c.skip_thumbnail_generation:
        announce("Skipping Thumbnails update [flag set] ...")
        return
    announce("Updating thumbnails ...")
    c.thumbnail_job_manager.process_directory()

def update_crawler_status_completed(c: CloneIngestConfig) -> None:
    if not c.crawler_output:
        announce("Skipping crawler_status table update [no crawler output file provided] ...")
        return
    announce("Updating crawler status table to Ingest Complete ...")
    c.crawler_status_tracker.update_crawler_status(
        status="Ingest Complete",
        timestamp=datetime.now(),
        update_db=not c.skip_db_update)

def update_crawler_status_downloaded(c: CloneIngestConfig) -> None:
    if not c.crawler_output:
        announce("Skipping crawler_status table update [no crawler output file provided] ...")
        return
    announce("Updating crawler status table to Crawl and Download Complete ...")
    c.crawler_status_tracker.update_crawler_status(
        status="Crawl and Download Complete",
        timestamp=c.batch_timestamp,
        update_db=not c.skip_db_update)

def parse_and_ocr(c: CloneIngestConfig) -> None:
    announce(f"Parsing and OCR'ing docs from '{c.raw_doc_base_dir}' ...")
    pdf_to_json(
        parser_path="common.document_parser.parsers.policy_analytics.parse::parse",
        source=str(c.raw_doc_base_dir),
        destination=str(c.parsed_doc_base_dir),
        metadata=str(c.raw_doc_base_dir),
        ocr_missing_doc=True,
        multiprocess=c.max_threads,
        num_ocr_threads=c.max_ocr_threads,
    )

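# The parser_path argument above follows a "dotted.module.path::callable"
# convention. A minimal sketch of how such a path could be resolved
# (illustrative only -- the real resolution happens inside pdf_to_json,
# which is not defined in this module):
#
#   import importlib
#   mod_name, fn_name = parser_path.split("::")
#   parse_fn = getattr(importlib.import_module(mod_name), fn_name)
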
def update_neo4j(c: CoreIngestConfig) -> None:
    if c.skip_neo4j_update:
        announce("Skipping Neo4J update ...")
    else:
        announce("Updating Neo4J ...")
        c.neo4j_job_manager.run_update(
            source=c.parsed_doc_base_dir,
            clear=False,
            max_threads=c.max_threads_neo4j,
            scrape_wiki=False,
            without_web_scraping=True,
            infobox_dir=c.infobox_dir)

def update_revocations(c: CloneIngestConfig) -> None:
    if c.skip_revocation_update:
        announce("Skipping Revocations update [flag set] ...")
        return
    announce("Updating revocations ...")
    c.crawler_status_tracker.handle_revocations(
        index_name=c.index_name,
        update_db=not c.skip_db_update,
        update_es=not c.skip_es_revocation,
        update_neo4j=not c.skip_neo4j_update)

def load_files(c: CoreIngestConfig) -> None:
    """Runs the load function, updating the publications table, the
    versioned_docs table, and s3. Docs are only updated in versioned_docs
    if they are also uploaded to s3."""
    announce("Loading files into S3 & DB ...")
    c.load_manager.load(
        raw_dir=c.raw_doc_base_dir,
        metadata_dir=c.raw_doc_base_dir,
        parsed_dir=c.parsed_doc_base_dir,
        ingest_ts=c.batch_timestamp,
        update_s3=True,
        max_threads=c.max_s3_threads,
        update_db=not c.skip_db_update,
        thumbnail_dir=c.thumbnail_doc_base_dir)

def load_files(c: CloneIngestConfig) -> None:
    announce("Loading files into S3 & DB ...")
    c.load_manager.load(
        raw_dir=c.raw_doc_base_dir,
        metadata_dir=c.raw_doc_base_dir,
        parsed_dir=c.parsed_doc_base_dir,
        ingest_ts=c.batch_timestamp,
        update_s3=True,
        max_threads=c.max_s3_threads,
        update_db=not c.skip_db_update,
        thumbnail_dir=c.thumbnail_doc_base_dir)

def core_checkpoint_ingest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for parsing docs from checkpointed s3 prefixes"""
    cig = CheckpointIngestConfig.from_core_config(
        core_config=core_ingest_config, other_config_kwargs=kwargs)

    announce("Aggregating files for processing ...")
    announce("Aggregating files from checkpoints ...")
    last_prefix: t.Optional[TimestampedPrefix] = None
    with cig.checkpoint_manager.checkpoint_download_manager(
            base_download_dir=cig.download_base_dir,
            advance_checkpoint=cig.advance_checkpoint,
            limit=cig.checkpoint_limit if cig.checkpoint_limit > 0 else None,
            max_threads=cig.max_threads) as downloaded_prefixes:
        for dp in downloaded_prefixes:
            last_prefix = dp.timestamped_prefix
            for f in (p for p in dp.local_path.iterdir() if p.is_file()):
                shutil.copy(str(f), str(Path(cig.raw_doc_base_dir, f.name)))

    if not last_prefix:
        announce("There was nothing to do, skipping remainder of ingest ...")
        exit(0)

    if not next((p for p in cig.raw_doc_base_dir.iterdir() if p.is_file()), None):
        announce("[WARNING] No files were downloaded for processing, exiting pipeline.")
        exit(1)

    CoreIngestSteps.update_crawler_status_downloaded(cig)
    CoreIngestSteps.update_crawler_status_in_progress(cig)
    CoreIngestSteps.backup_db(cig)
    CoreIngestSteps.backup_snapshots(cig)
    CoreIngestSteps.update_thumbnails(cig)
    CoreIngestSteps.parse_and_ocr(cig)
    CoreIngestSteps.load_files(cig)
    CoreIngestSteps.update_s3_snapshots(cig)
    CoreIngestSteps.refresh_materialized_tables(cig)
    CoreIngestSteps.update_es(cig)
    CoreIngestSteps.update_neo4j(cig)
    CoreIngestSteps.update_revocations(cig)
    CoreIngestSteps.update_crawler_status_completed(cig)
    announce("Pipeline Finished")

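# Usage sketch (hypothetical invocation; CoreIngestConfig construction and
# the exact kwarg names accepted by CheckpointIngestConfig.from_core_config
# are assumptions based on the attributes referenced above):
#
#   cfg = CoreIngestConfig(...)  # built elsewhere from CLI/env settings
#   core_checkpoint_ingest(cfg,
#                          advance_checkpoint=True,  # consume checkpoints as processed
#                          checkpoint_limit=5)       # <= 0 means "no limit" per the code above
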
def update_s3_snapshots(c: CoreIngestConfig) -> None:
    """Uploads s3 snapshots of the raw+parsed corpus to s3, allowing for
    reversion if anything gets corrupted in the main prefixes."""
    announce("Updating raw/parsed snapshot locations in S3")
    c.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=c.raw_doc_base_dir,
        snapshot_type=SnapshotType.RAW,
        replace=False,
        max_threads=c.max_s3_threads)
    c.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=c.parsed_doc_base_dir,
        snapshot_type=SnapshotType.PARSED,
        replace=False,
        max_threads=c.max_s3_threads)
    c.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=c.thumbnail_doc_base_dir,
        snapshot_type=SnapshotType.THUMBNAIL,
        replace=False,
        max_threads=c.max_s3_threads)

def update_s3_snapshots(c: CloneIngestConfig) -> None:
    announce("Updating raw/parsed snapshot locations in S3")
    c.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=c.raw_doc_base_dir,
        snapshot_type=SnapshotType.RAW,
        replace=False,
        max_threads=c.max_threads)
    c.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=c.parsed_doc_base_dir,
        snapshot_type=SnapshotType.PARSED,
        replace=False,
        max_threads=c.max_threads)
    c.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=c.thumbnail_doc_base_dir,
        snapshot_type=SnapshotType.THUMBNAIL,
        replace=False,
        max_threads=c.max_threads)

def core_reparse(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for pulling raw documents from s3, parsing, and
    reuploading/reindexing/populating neo4j"""
    announce('Pulling down raw snapshot files for parsing ...')
    core_ingest_config.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=core_ingest_config.raw_doc_base_dir,
        snapshot_type='raw',
        using_db=False,
        max_threads=core_ingest_config.max_threads)

    if not next((p for p in core_ingest_config.raw_doc_base_dir.iterdir()
                 if p.is_file()), None):
        announce("[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    CoreIngestSteps.backup_snapshots(core_ingest_config)
    CoreIngestSteps.update_thumbnails(core_ingest_config)
    CoreIngestSteps.parse_and_ocr(core_ingest_config)
    CoreIngestSteps.update_es(core_ingest_config)
    CoreIngestSteps.update_neo4j(core_ingest_config)
    CoreIngestSteps.update_revocations(core_ingest_config)

    announce('Pushing up parsed files to s3 snapshot location ...')
    core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=core_ingest_config.parsed_doc_base_dir,
        snapshot_type='parsed',
        max_threads=core_ingest_config.max_threads)

    if core_ingest_config.force_ocr:
        announce('Pushing up raw files to s3 snapshot location ...')
        core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
            local_dir=core_ingest_config.raw_doc_base_dir,
            snapshot_type='raw',
            max_threads=core_ingest_config.max_threads)

    if not core_ingest_config.skip_thumbnail_generation:
        announce('Pushing up thumbnails to s3 snapshot location ...')
        core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
            local_dir=core_ingest_config.thumbnail_doc_base_dir,
            snapshot_type='thumbnails',
            max_threads=core_ingest_config.max_threads)

def core_local_ingest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for ingesting docs from local directories"""
    lic = LocalIngestConfig.from_core_config(core_config=core_ingest_config,
                                             other_config_kwargs=kwargs)

    if not next((p for p in lic.raw_doc_base_dir.iterdir() if p.is_file()), None):
        announce("[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    CoreIngestSteps.update_crawler_status_downloaded(lic)
    CoreIngestSteps.update_crawler_status_in_progress(lic)
    CoreIngestSteps.backup_db(lic)
    CoreIngestSteps.backup_snapshots(lic)
    CoreIngestSteps.update_thumbnails(lic)
    if lic.skip_parse:
        announce("Parsed files passed, skipping parsing.")
    else:
        CoreIngestSteps.parse_and_ocr(lic)
    CoreIngestSteps.load_files(lic)
    CoreIngestSteps.update_s3_snapshots(lic)
    CoreIngestSteps.refresh_materialized_tables(lic)
    CoreIngestSteps.update_es(lic)
    CoreIngestSteps.update_neo4j(lic)
    CoreIngestSteps.update_revocations(lic)
    CoreIngestSteps.update_crawler_status_completed(lic)
    announce("Pipeline Finished")

def core_update_thumbnails(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for pulling down pdfs/metadata from s3 and updating thumbnails"""
    announce('Pulling down raw snapshot files for updating thumbnails ...')
    core_ingest_config.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=core_ingest_config.raw_doc_base_dir,
        snapshot_type='raw',
        using_db=False,
        max_threads=core_ingest_config.max_threads)

    if not next((p for p in core_ingest_config.raw_doc_base_dir.iterdir()
                 if p.is_file()), None):
        announce("[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    announce('Updating thumbnails ...')
    CoreIngestSteps.update_thumbnails(core_ingest_config)

    announce('Pushing up thumbnails to s3 snapshot location ...')
    core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=core_ingest_config.thumbnail_doc_base_dir,
        snapshot_type='thumbnails',
        max_threads=core_ingest_config.max_threads)

def clone_reindex(clone_ingest_config: CloneIngestConfig, **kwargs):
    """Pipeline for pulling down jsons from s3 and reindexing into elasticsearch"""
    announce('Pulling down parsed snapshot files for reindexing ...')
    clone_ingest_config.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=clone_ingest_config.parsed_doc_base_dir,
        snapshot_type='parsed',
        using_db=False,
        max_threads=clone_ingest_config.max_threads)

    if not next((p for p in clone_ingest_config.parsed_doc_base_dir.iterdir()
                 if p.is_file()), None):
        announce("[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    announce('Reindexing in elasticsearch ...')
    CloneIngestSteps.update_es(clone_ingest_config)

def core_update_neo4j(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for pulling down jsons from s3 and repopulating neo4j"""
    announce('Pulling down parsed snapshot files for updating neo4j ...')
    core_ingest_config.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=core_ingest_config.parsed_doc_base_dir,
        snapshot_type='parsed',
        using_db=False,
        max_threads=core_ingest_config.max_threads)

    if not next((p for p in core_ingest_config.parsed_doc_base_dir.iterdir()
                 if p.is_file()), None):
        announce("[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    announce('Updating neo4j ...')
    CoreIngestSteps.update_neo4j(core_ingest_config)

def core_manifest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for ingesting and deleting docs based on a manifest"""
    mc = ManifestConfig.from_core_config(core_config=core_ingest_config,
                                         other_config_kwargs=kwargs)

    # Setup Steps
    announce("Aggregating files for processing ...")
    announce(f"Downloading raw files from s3 prefix: {mc.s3_raw_ingest_prefix} ...")
    Config.s3_utils.download_dir(local_dir=mc.raw_doc_base_dir,
                                 prefix_path=mc.s3_raw_ingest_prefix,
                                 bucket=mc.bucket_name)
    CoreIngestSteps.backup_db(mc)
    CoreIngestSteps.backup_snapshots(mc)

    # NOTE: the "*.*" glob only counts files whose names contain a dot
    count_docs_copied = len(glob.glob(str(mc.raw_doc_base_dir) + "/*.*"))
    if count_docs_copied == 0:
        announce("[WARNING] No files were downloaded for processing, exiting pipeline.")
        exit(1)
    else:
        # Ingest Steps -- Skipped if no files to ingest
        CoreIngestSteps.create_metadata_from_manifest(mc)
        CoreIngestSteps.parse_and_ocr(mc)
        CoreIngestSteps.update_thumbnails(mc)
        CoreIngestSteps.load_files(mc)
        CoreIngestSteps.update_s3_snapshots(mc)
        CoreIngestSteps.refresh_materialized_tables(mc)
        CoreIngestSteps.update_es(mc)
        CoreIngestSteps.update_neo4j(mc)

    # Delete Steps
    CoreIngestSteps.delete_from_elasticsearch(mc)
    CoreIngestSteps.delete_from_neo4j(mc)
    CoreIngestSteps.delete_from_db(mc)
    CoreIngestSteps.refresh_materialized_tables(mc)
    CoreIngestSteps.delete_from_s3(mc)

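# Usage sketch for the manifest pipeline (hypothetical invocation; the kwarg
# name mirrors the ManifestConfig attribute referenced above, but its exact
# handling in from_core_config is an assumption):
#
#   core_manifest(cfg, s3_raw_ingest_prefix="manifests/2024-06-01/raw/")
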
def clone_s3_ingest(clone_ingest_config: CloneIngestConfig, **kwargs):
    """Pipeline for parsing docs directly from s3"""
    sig = S3IngestConfig.from_clone_config(clone_config=clone_ingest_config,
                                           other_config_kwargs=kwargs)

    announce("Aggregating files for processing ...")
    announce(f"Downloading raw files from s3 prefix: {sig.s3_raw_ingest_prefix} ...")
    Config.s3_utils.download_dir(local_dir=sig.raw_doc_base_dir,
                                 prefix_path=sig.s3_raw_ingest_prefix,
                                 bucket=sig.bucket_name)

    # if not next((p for p in sig.raw_doc_base_dir.iterdir() if p.is_file()), None):
    #     announce("[WARNING] No files were downloaded for processing, exiting pipeline.")
    #     exit(1)

    CloneIngestSteps.create_metadata(sig)
    CloneIngestSteps.parse_and_ocr(sig)
    CloneIngestSteps.load_files(sig)
    CloneIngestSteps.update_s3_cloning(sig)
    CloneIngestSteps.update_es(sig)
    announce("Pipeline Finished")

def refresh_materialized_tables(c: CoreIngestConfig) -> None:
    announce("Refreshing materialized tables for all databases (e.g. web snapshot table) ...")
    c.core_db_manager.refresh_materialized_tables_for_all_dbs()

def backup_snapshots(c: CoreIngestConfig) -> None:
    if not c.skip_snapshot_backup:
        announce("Backing up current snapshots ...")
        c.snapshot_manager.backup_all_current_snapshots(
            snapshot_ts=c.batch_timestamp)

def core_s3_ingest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for parsing docs directly from s3"""
    sig = S3IngestConfig.from_core_config(core_config=core_ingest_config,
                                          other_config_kwargs=kwargs)

    announce("Aggregating files for processing ...")
    announce(f"Downloading raw files from s3 prefix: {sig.s3_raw_ingest_prefix} ...")
    Config.s3_utils.download_dir(local_dir=sig.raw_doc_base_dir,
                                 prefix_path=sig.s3_raw_ingest_prefix,
                                 bucket=sig.bucket_name)

    if not next((p for p in sig.raw_doc_base_dir.iterdir() if p.is_file()), None):
        announce("[WARNING] No files were downloaded for processing, exiting pipeline.")
        exit(1)

    CoreIngestSteps.create_metadata(sig)
    CoreIngestSteps.backup_db(sig)
    CoreIngestSteps.backup_snapshots(sig)

    if sig.s3_parsed_ingest_prefix:
        announce(f"Downloading parsed files from s3 prefix: {sig.s3_parsed_ingest_prefix} ...")
        Config.s3_utils.download_dir(local_dir=sig.parsed_doc_base_dir,
                                     prefix_path=sig.s3_parsed_ingest_prefix,
                                     bucket=sig.bucket_name)
        if not next((p for p in sig.parsed_doc_base_dir.iterdir() if p.is_file()), None):
            announce("[WARNING] No parsed files were downloaded for processing, exiting pipeline.")
            exit(1)
    else:
        CoreIngestSteps.parse_and_ocr(sig)

    CoreIngestSteps.update_thumbnails(sig)
    CoreIngestSteps.load_files(sig)
    CoreIngestSteps.update_s3_snapshots(sig)
    CoreIngestSteps.refresh_materialized_tables(sig)
    CoreIngestSteps.update_es(sig)
    CoreIngestSteps.update_neo4j(sig)
    announce("Pipeline Finished")

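# Usage sketch (hypothetical; whether these attributes can be supplied via
# **kwargs depends on S3IngestConfig.from_core_config, which is not shown,
# and the prefix value is an example, not a real location):
#
#   core_s3_ingest(cfg,
#                  s3_raw_ingest_prefix="ingest/raw/2024-06-01/",
#                  s3_parsed_ingest_prefix=None)  # falsy => parse and OCR locally
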
def delete_from_s3(c: CoreIngestConfig) -> None:
    announce("Removing docs from S3 ...")
    remove_docs_from_current_snapshot(sm=c.snapshot_manager,
                                      removal_list=c.removal_list)

def create_metadata(sc: S3IngestConfig) -> None:
    if sc.metadata_creation_group:
        announce("Creating metadata for files without existing metadata ...")
        sc.metadata_creater.create_metadata()

def delete_from_elasticsearch(c: CoreIngestConfig) -> None:
    announce("Removing docs from Elasticsearch ...")
    remove_docs_from_index(index_name=c.index_name, removal_list=c.removal_list)

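# The pipelines above all repeat the same lazy "does this directory contain
# at least one file?" probe via next(...). A minimal helper sketch that could
# replace it (not part of the original module; assumes the module-level
# pathlib.Path import already used above):
def _dir_has_files(d: Path) -> bool:
    """Return True if `d` directly contains at least one regular file."""
    return any(p.is_file() for p in d.iterdir())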