Exemplo n.º 1
0
 def backup_db(c: CoreIngestConfig) -> None:
     """Back up all tables in every configured DB, unless the skip flag is set."""
     if c.skip_db_backup:
         announce("Skipping DB backup ...")
         return
     announce("Backing up DB(s) ...")
     c.core_db_manager.backup_all_tables_for_all_dbs(
         ts=c.batch_timestamp,
         job_dir=c.db_backup_dir,
     )
Exemplo n.º 2
0
 def update_es(c: CloneIngestConfig) -> None:
     """Create/refresh the ES index, then point the alias at it when one is configured."""
     announce(f"Creating/Updating ES index: {c.index_name} ...")
     publisher = c.es_publisher
     publisher.create_index()
     publisher.index_jsons()
     if not c.alias_name:
         return
     announce(f"Setting ES index('{c.index_name}') to alias('{c.alias_name}') ...")
     publisher.update_alias()
Exemplo n.º 3
0
    def delete_from_db(c: CoreIngestConfig) -> None:
        """Remove the configured doc tuples from the DB, honoring the skip flag."""
        if c.skip_db_update:
            announce("Skip DB removal ...")
        else:
            announce("Removing docs from DB ...")
            remove_docs_from_db(lm=c.load_manager, removal_list=c.db_tuple_list)
Exemplo n.º 4
0
    def update_thumbnails(c: CoreIngestConfig) -> None:
        """Uploads the .png thumbnails of the raw documents to the appropriate s3 prefix"""
        if c.skip_thumbnail_generation:
            announce("Skipping Thumbnails update [flag set] ...")
        else:
            announce("Updating thumbnails ...")
            c.thumbnail_job_manager.process_directory()
Exemplo n.º 5
0
    def delete_from_neo4j(c: CoreIngestConfig) -> None:
        """Remove the docs on the removal list from Neo4j, honoring the skip flag."""
        if c.skip_neo4j_update:
            announce("Skip Neo4j removal ...")
        else:
            announce("Removing docs from Neo4j ...")
            remove_docs_from_neo4j(njm=c.neo4j_job_manager, removal_list=c.removal_list)
Exemplo n.º 6
0
    def update_thumbnails(c: CloneIngestConfig) -> None:
        """Regenerate/upload document thumbnails, honoring the skip flag."""
        if c.skip_thumbnail_generation:
            announce("Skipping Thumbnails update [flag set] ...")
        else:
            announce("Updating thumbnails ...")
            c.thumbnail_job_manager.process_directory()
Exemplo n.º 7
0
    def update_crawler_status_completed(c: CloneIngestConfig) -> None:
        """Mark the crawler status table 'Ingest Complete'; no-op without crawler output."""
        if not c.crawler_output:
            announce("Skipping crawler_status table update [no crawler output file provided] ...")
            return

        announce("Updating crawler status table to Ingest Complete...")
        c.crawler_status_tracker.update_crawler_status(
            status="Ingest Complete",
            timestamp=datetime.now(),
            update_db=not c.skip_db_update,
        )
Exemplo n.º 8
0
    def update_crawler_status_downloaded(c: CloneIngestConfig) -> None:
        """Mark the crawler status table 'Crawl and Download Complete'; no-op without crawler output."""
        if not c.crawler_output:
            announce("Skipping crawler_status table update [no crawler output file provided] ...")
            return

        announce("Updating crawler status table to Crawl and Download Complete...")
        c.crawler_status_tracker.update_crawler_status(
            status="Crawl and Download Complete",
            timestamp=c.batch_timestamp,
            update_db=not c.skip_db_update,
        )
Exemplo n.º 9
0
 def parse_and_ocr(c: CloneIngestConfig) -> None:
     """Parse the raw docs into JSON, OCR'ing any docs that are missing text."""
     announce(f"Parsing and OCR'ing docs from '{c.raw_doc_base_dir}' ...")
     # The raw dir doubles as the metadata dir for the parser.
     raw_dir = str(c.raw_doc_base_dir)
     pdf_to_json(
         parser_path="common.document_parser.parsers.policy_analytics.parse::parse",
         source=raw_dir,
         destination=str(c.parsed_doc_base_dir),
         metadata=raw_dir,
         ocr_missing_doc=True,
         multiprocess=c.max_threads,
         num_ocr_threads=c.max_ocr_threads,
     )
Exemplo n.º 10
0
 def update_neo4j(c: CoreIngestConfig) -> None:
     """Repopulate Neo4j from the parsed docs, honoring the skip flag."""
     if c.skip_neo4j_update:
         announce("Skipping Neo4J update ...")
         return
     announce("Updating Neo4J ...")
     c.neo4j_job_manager.run_update(
         source=c.parsed_doc_base_dir,
         clear=False,
         max_threads=c.max_threads_neo4j,
         scrape_wiki=False,
         without_web_scraping=True,
         infobox_dir=c.infobox_dir,
     )
Exemplo n.º 11
0
    def update_revocations(c: CloneIngestConfig) -> None:
        """Propagate document revocations to DB/ES/Neo4j, honoring the respective skip flags."""
        if c.skip_revocation_update:
            announce("Skipping Revocations update [flag set] ...")
            return

        announce("Updating revocations ...")
        c.crawler_status_tracker.handle_revocations(
            index_name=c.index_name,
            update_db=not c.skip_db_update,
            update_es=not c.skip_es_revocation,
            update_neo4j=not c.skip_neo4j_update,
        )
Exemplo n.º 12
0
 def load_files(c: CoreIngestConfig) -> None:
     """Runs the load function, updating the publications table, the versioned_docs table, and s3.
     Docs are only updated in versioned_docs if they are also uploaded to s3"""
     announce("Loading files into S3 & DB ...")
     # The raw dir doubles as the metadata dir for the loader.
     raw_dir = c.raw_doc_base_dir
     c.load_manager.load(
         raw_dir=raw_dir,
         metadata_dir=raw_dir,
         parsed_dir=c.parsed_doc_base_dir,
         ingest_ts=c.batch_timestamp,
         update_s3=True,
         max_threads=c.max_s3_threads,
         update_db=not c.skip_db_update,
         thumbnail_dir=c.thumbnail_doc_base_dir,
     )
Exemplo n.º 13
0
 def load_files(c: CloneIngestConfig) -> None:
     """Load raw/parsed docs and thumbnails into S3, and update the DB unless skipped."""
     announce("Loading files into S3 & DB ...")
     # The raw dir doubles as the metadata dir for the loader.
     raw_dir = c.raw_doc_base_dir
     c.load_manager.load(raw_dir=raw_dir,
                         metadata_dir=raw_dir,
                         parsed_dir=c.parsed_doc_base_dir,
                         ingest_ts=c.batch_timestamp,
                         update_s3=True,
                         max_threads=c.max_s3_threads,
                         update_db=not c.skip_db_update,
                         thumbnail_dir=c.thumbnail_doc_base_dir)
Exemplo n.º 14
0
def core_checkpoint_ingest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for parsing docs from checkpointed s3 prefixes.

    Downloads timestamped checkpoint prefixes via the checkpoint manager,
    flattens their files into the raw doc dir, then runs the standard core
    ingest steps. Exits the process (0 = nothing to do, 1 = downloaded
    prefixes contained no files) instead of returning on the empty paths.
    """
    cig = CheckpointIngestConfig.from_core_config(
        core_config=core_ingest_config, other_config_kwargs=kwargs)

    announce("Aggregating files for processing ...")
    announce(f"Aggregating files from checkpoints ...")
    # Records the most recent prefix processed; stays None when the manager
    # yields nothing, which distinguishes "no checkpoints" from "empty files".
    last_prefix: t.Optional[TimestampedPrefix] = None
    with cig.checkpoint_manager.checkpoint_download_manager(
            base_download_dir=cig.download_base_dir,
            advance_checkpoint=cig.advance_checkpoint,
            # A limit of <= 0 is treated as "no limit".
            limit=cig.checkpoint_limit if cig.checkpoint_limit > 0 else None,
            max_threads=cig.max_threads) as downloaded_prefixes:
        for dp in downloaded_prefixes:
            last_prefix = dp.timestamped_prefix
            # Copy each downloaded prefix's files flat into the raw doc dir.
            for f in (p for p in dp.local_path.iterdir() if p.is_file()):
                shutil.copy(str(f), str(Path(cig.raw_doc_base_dir, f.name)))

    # No prefixes were yielded at all -> clean no-op exit.
    if not last_prefix:
        announce("There was nothing to do, skipping remainder of ingest ...")
        exit(0)

    # Prefixes existed but produced no files -> error exit.
    if not next(
        (p for p in cig.raw_doc_base_dir.iterdir() if p.is_file()), None):
        announce(
            "[WARNING] No files were downloaded for processing, exiting pipeline."
        )
        exit(1)

    CoreIngestSteps.update_crawler_status_downloaded(cig)
    CoreIngestSteps.update_crawler_status_in_progress(cig)
    CoreIngestSteps.backup_db(cig)
    CoreIngestSteps.backup_snapshots(cig)
    CoreIngestSteps.update_thumbnails(cig)
    CoreIngestSteps.parse_and_ocr(cig)
    CoreIngestSteps.load_files(cig)
    CoreIngestSteps.update_s3_snapshots(cig)
    CoreIngestSteps.refresh_materialized_tables(cig)
    CoreIngestSteps.update_es(cig)
    CoreIngestSteps.update_neo4j(cig)
    CoreIngestSteps.update_revocations(cig)
    CoreIngestSteps.update_crawler_status_completed(cig)

    announce("Pipeline Finished")
Exemplo n.º 15
0
 def update_s3_snapshots(c: CoreIngestConfig) -> None:
     """Uploads s3 snapshots of the raw+parsed corpus to s3, allowing for reversion if anything gets corrupted
     in the main prefixes"""
     announce("Updating raw/parsed snapshot locations in S3")
     # Same three uploads as before, expressed as a table; order preserved.
     snapshot_targets = (
         (c.raw_doc_base_dir, SnapshotType.RAW),
         (c.parsed_doc_base_dir, SnapshotType.PARSED),
         (c.thumbnail_doc_base_dir, SnapshotType.THUMBNAIL),
     )
     for local_dir, snap_type in snapshot_targets:
         c.snapshot_manager.update_current_snapshot_from_disk(
             local_dir=local_dir,
             snapshot_type=snap_type,
             replace=False,
             max_threads=c.max_s3_threads)
Exemplo n.º 16
0
 def update_s3_snapshots(c: CloneIngestConfig) -> None:
     """Push the raw, parsed, and thumbnail corpora up to their current S3 snapshot locations."""
     announce("Updating raw/parsed snapshot locations in S3")
     # Same three uploads as before, expressed as a table; order preserved.
     snapshot_targets = (
         (c.raw_doc_base_dir, SnapshotType.RAW),
         (c.parsed_doc_base_dir, SnapshotType.PARSED),
         (c.thumbnail_doc_base_dir, SnapshotType.THUMBNAIL),
     )
     for local_dir, snap_type in snapshot_targets:
         c.snapshot_manager.update_current_snapshot_from_disk(
             local_dir=local_dir,
             snapshot_type=snap_type,
             replace=False,
             max_threads=c.max_threads)
Exemplo n.º 17
0
def core_reparse(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for pulling raw documents from s3, parsing, and reuploading/reindexing/populating neo4j"""
    announce('Pulling down raw snapshot files for parsing ...')
    core_ingest_config.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=core_ingest_config.raw_doc_base_dir,
        snapshot_type='raw',
        using_db=False,
        max_threads=core_ingest_config.max_threads)

    # Abort with a non-zero exit when the snapshot pull yielded no files.
    if not next((p for p in core_ingest_config.raw_doc_base_dir.iterdir()
                 if p.is_file()), None):
        announce(
            "[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)
    CoreIngestSteps.backup_snapshots(core_ingest_config)
    CoreIngestSteps.update_thumbnails(core_ingest_config)
    CoreIngestSteps.parse_and_ocr(core_ingest_config)

    CoreIngestSteps.update_es(core_ingest_config)
    CoreIngestSteps.update_neo4j(core_ingest_config)
    CoreIngestSteps.update_revocations(core_ingest_config)

    announce('Pushing up parsed files to s3 snapshot location ...')
    core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=core_ingest_config.parsed_doc_base_dir,
        snapshot_type='parsed',
        max_threads=core_ingest_config.max_threads)

    # NOTE(review): raw files are only re-uploaded under force_ocr --
    # presumably because forced OCR rewrites the raw docs; confirm.
    if core_ingest_config.force_ocr:
        announce('Pushing up raw files to s3 snapshot location ...')
        core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
            local_dir=core_ingest_config.raw_doc_base_dir,
            snapshot_type='raw',
            max_threads=core_ingest_config.max_threads)

    if not core_ingest_config.skip_thumbnail_generation:
        announce('Pushing up thumbnails to s3 snapshot location ...')
        core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
            local_dir=core_ingest_config.thumbnail_doc_base_dir,
            snapshot_type='thumbnails',
            max_threads=core_ingest_config.max_threads)
Exemplo n.º 18
0
def core_local_ingest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for ingesting docs from local directories.

    Exits the process with status 1 when the raw doc dir contains no files.
    """
    lic = LocalIngestConfig.from_core_config(core_config=core_ingest_config,
                                             other_config_kwargs=kwargs)

    # Abort early when there is nothing to ingest.
    if not next(
        (p for p in lic.raw_doc_base_dir.iterdir() if p.is_file()), None):
        announce(
            "[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)
    CoreIngestSteps.update_crawler_status_downloaded(lic)
    CoreIngestSteps.update_crawler_status_in_progress(lic)
    CoreIngestSteps.backup_db(lic)
    CoreIngestSteps.backup_snapshots(lic)
    CoreIngestSteps.update_thumbnails(lic)
    # Fix: the original tested `if not lic.skip_parse` and then both announced
    # "skipping parsing." AND ran the parser -- the skip message belongs on the
    # skip branch, and parsing should only run when not skipped.
    if lic.skip_parse:
        announce("Parsed files passed, skipping parsing.")
    else:
        CoreIngestSteps.parse_and_ocr(lic)
    CoreIngestSteps.load_files(lic)
    CoreIngestSteps.update_s3_snapshots(lic)
    CoreIngestSteps.refresh_materialized_tables(lic)
    CoreIngestSteps.update_es(lic)
    CoreIngestSteps.update_neo4j(lic)
    CoreIngestSteps.update_revocations(lic)
    CoreIngestSteps.update_crawler_status_completed(lic)

    announce("Pipeline Finished")
Exemplo n.º 19
0
def core_update_thumbnails(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for pulling down pdfs/metadata from s3 and updating thumbnails"""
    # Fix: message previously said "parsed snapshot files for updating neo4j"
    # (copy-paste from core_update_neo4j); this pipeline pulls the RAW
    # snapshot in order to regenerate thumbnails.
    announce('Pulling down raw snapshot files for updating thumbnails ...')
    core_ingest_config.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=core_ingest_config.raw_doc_base_dir,
        snapshot_type='raw',
        using_db=False,
        max_threads=core_ingest_config.max_threads)

    # Abort with a non-zero exit when the snapshot pull yielded no files.
    if not next((p for p in core_ingest_config.raw_doc_base_dir.iterdir()
                 if p.is_file()), None):
        announce(
            "[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    announce('Updating thumbnails ...')
    CoreIngestSteps.update_thumbnails(core_ingest_config)

    announce('Pushing up thumbnails to s3 snapshot location ...')
    core_ingest_config.snapshot_manager.update_current_snapshot_from_disk(
        local_dir=core_ingest_config.thumbnail_doc_base_dir,
        snapshot_type='thumbnails',
        max_threads=core_ingest_config.max_threads)
Exemplo n.º 20
0
def clone_reindex(clone_ingest_config: CloneIngestConfig, **kwargs):
    """Pipeline for pulling down jsons from s3 and reindexing into elasticsearch"""
    cfg = clone_ingest_config
    announce('Pulling down parsed snapshot files for reindexing ...')
    cfg.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=cfg.parsed_doc_base_dir,
        snapshot_type='parsed',
        using_db=False,
        max_threads=cfg.max_threads)

    # Nothing came down from the snapshot: bail with a non-zero exit.
    if not any(p.is_file() for p in cfg.parsed_doc_base_dir.iterdir()):
        announce(
            "[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    announce('Reindexing in elasticsearch ...')
    CloneIngestSteps.update_es(cfg)
Exemplo n.º 21
0
def core_update_neo4j(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for pulling down jsons from s3 and repopulating neo4j"""
    cfg = core_ingest_config
    announce('Pulling down parsed snapshot files for updating neo4j ...')
    cfg.snapshot_manager.pull_current_snapshot_to_disk(
        local_dir=cfg.parsed_doc_base_dir,
        snapshot_type='parsed',
        using_db=False,
        max_threads=cfg.max_threads)

    # Nothing came down from the snapshot: bail with a non-zero exit.
    if not any(p.is_file() for p in cfg.parsed_doc_base_dir.iterdir()):
        announce(
            "[WARNING] No files were found for processing, exiting pipeline.")
        exit(1)

    announce('Updating neo4j ...')
    CoreIngestSteps.update_neo4j(cfg)
Exemplo n.º 22
0
def core_manifest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for ingesting and deleting docs driven by a manifest.

    Downloads raw files from the manifest's s3 prefix, runs the ingest steps
    on them, then runs the manifest's delete steps. Exits with status 1 when
    no files were downloaded.
    """
    mc = ManifestConfig.from_core_config(core_config=core_ingest_config,
                                         other_config_kwargs=kwargs)

    # Setup Steps
    announce("Aggregating files for processing ...")
    announce(
        f"Downloading raw files from s3 prefix: {mc.s3_raw_ingest_prefix} ...")
    Config.s3_utils.download_dir(local_dir=mc.raw_doc_base_dir,
                                 prefix_path=mc.s3_raw_ingest_prefix,
                                 bucket=mc.bucket_name)

    CoreIngestSteps.backup_db(mc)
    CoreIngestSteps.backup_snapshots(mc)
    count_docs_copied = len(glob.glob(str(mc.raw_doc_base_dir) + "/*.*"))
    if count_docs_copied == 0:
        announce(
            "[WARNING] No files were downloaded for processing, exiting pipeline."
        )
        exit(1)
    else:
        # Ingest Steps -- Skipped if no files to ingest.
        # Fix: was `elif count_docs_copied > 1`, which silently skipped every
        # ingest step when exactly one document was downloaded.
        CoreIngestSteps.create_metadata_from_manifest(mc)
        CoreIngestSteps.parse_and_ocr(mc)

        CoreIngestSteps.update_thumbnails(mc)
        CoreIngestSteps.load_files(mc)
        CoreIngestSteps.update_s3_snapshots(mc)
        CoreIngestSteps.refresh_materialized_tables(mc)
        CoreIngestSteps.update_es(mc)
        CoreIngestSteps.update_neo4j(mc)

    # Delete Steps
    CoreIngestSteps.delete_from_elasticsearch(mc)

    CoreIngestSteps.delete_from_neo4j(mc)

    CoreIngestSteps.delete_from_db(mc)
    CoreIngestSteps.refresh_materialized_tables(mc)

    CoreIngestSteps.delete_from_s3(mc)
Exemplo n.º 23
0
def clone_s3_ingest(clone_ingest_config: CloneIngestConfig, **kwargs):
    """Pipeline for parsing docs directly from s3"""
    cfg = S3IngestConfig.from_clone_config(clone_config=clone_ingest_config,
                                           other_config_kwargs=kwargs)
    announce("Aggregating files for processing ...")
    announce(
        f"Downloading raw files from s3 prefix: {cfg.s3_raw_ingest_prefix} ..."
    )
    Config.s3_utils.download_dir(local_dir=cfg.raw_doc_base_dir,
                                 prefix_path=cfg.s3_raw_ingest_prefix,
                                 bucket=cfg.bucket_name)

    # NOTE(review): unlike core_s3_ingest, the empty-download guard
    # (warn + exit(1)) was commented out here in the original -- presumably
    # deliberate; an empty download proceeds through the remaining steps.
    CloneIngestSteps.create_metadata(cfg)

    CloneIngestSteps.parse_and_ocr(cfg)
    CloneIngestSteps.load_files(cfg)
    CloneIngestSteps.update_s3_cloning(cfg)
    CloneIngestSteps.update_es(cfg)

    announce("Pipeline Finished")
Exemplo n.º 24
0
 def refresh_materialized_tables(c: CoreIngestConfig) -> None:
     """Rebuild the materialized tables (e.g. the web snapshot table) in every DB."""
     message = ("Refreshing materialized tables for all databases "
                "(e.g. web snapshot table) ...")
     announce(message)
     c.core_db_manager.refresh_materialized_tables_for_all_dbs()
Exemplo n.º 25
0
 def backup_snapshots(c: CoreIngestConfig) -> None:
     """Back up the current S3 snapshots under the batch timestamp, unless skipped."""
     if c.skip_snapshot_backup:
         return
     announce("Backing up current snapshots ...")
     c.snapshot_manager.backup_all_current_snapshots(snapshot_ts=c.batch_timestamp)
Exemplo n.º 26
0
def core_s3_ingest(core_ingest_config: CoreIngestConfig, **kwargs):
    """Pipeline for parsing docs directly from s3.

    Downloads raw files from the configured s3 prefix and runs the core
    ingest steps on them. When a parsed-files prefix is also configured the
    pre-parsed JSONs are downloaded instead of running the parser. Exits
    with status 1 when a required download yields no files.
    """
    sig = S3IngestConfig.from_core_config(core_config=core_ingest_config,
                                          other_config_kwargs=kwargs)

    announce("Aggregating files for processing ...")
    announce(
        f"Downloading raw files from s3 prefix: {sig.s3_raw_ingest_prefix} ..."
    )
    Config.s3_utils.download_dir(local_dir=sig.raw_doc_base_dir,
                                 prefix_path=sig.s3_raw_ingest_prefix,
                                 bucket=sig.bucket_name)

    # Abort when no raw files were downloaded.
    if not next(
        (p for p in sig.raw_doc_base_dir.iterdir() if p.is_file()), None):
        announce(
            "[WARNING] No files were downloaded for processing, exiting pipeline."
        )
        exit(1)

    CoreIngestSteps.create_metadata(sig)

    CoreIngestSteps.backup_db(sig)
    CoreIngestSteps.backup_snapshots(sig)

    # Either pull pre-parsed files from s3, or run the parser locally.
    if sig.s3_parsed_ingest_prefix:
        announce(
            f"Downloading parsed files from s3 prefix: {sig.s3_parsed_ingest_prefix} ..."
        )
        Config.s3_utils.download_dir(local_dir=sig.parsed_doc_base_dir,
                                     prefix_path=sig.s3_parsed_ingest_prefix,
                                     bucket=sig.bucket_name)
        # A configured-but-empty parsed prefix is treated as a hard error.
        if not next(
            (p
             for p in sig.parsed_doc_base_dir.iterdir() if p.is_file()), None):
            announce(
                "[WARNING] No parsed files were downloaded for processing, exiting pipeline."
            )
            exit(1)
    else:
        CoreIngestSteps.parse_and_ocr(sig)

    CoreIngestSteps.update_thumbnails(sig)
    CoreIngestSteps.load_files(sig)
    CoreIngestSteps.update_s3_snapshots(sig)
    CoreIngestSteps.refresh_materialized_tables(sig)
    CoreIngestSteps.update_es(sig)
    CoreIngestSteps.update_neo4j(sig)

    announce("Pipeline Finished")
Exemplo n.º 27
0
    def delete_from_s3(c: CoreIngestConfig) -> None:
        """Delete every doc on the removal list from the current S3 snapshot."""
        announce("Removing docs from S3 ...")
        remove_docs_from_current_snapshot(
            sm=c.snapshot_manager, removal_list=c.removal_list)
Exemplo n.º 28
0
    def create_metadata(sc: S3IngestConfig) -> None:
        """Generate metadata for docs lacking it, when a creation group is configured."""
        if not sc.metadata_creation_group:
            return
        announce(
            "Creating metadata for files without existing metadata ...")
        sc.metadata_creater.create_metadata()
Exemplo n.º 29
0
    def delete_from_elasticsearch(c: CoreIngestConfig) -> None:
        """Delete every doc on the removal list from the ES index."""
        announce("Removing docs from Elasticsearch ...")
        remove_docs_from_index(
            index_name=c.index_name, removal_list=c.removal_list)