def mirror_old_style_records_etl():
    """Ensure the old-style records exist for all present new-style ones.

    Walks every new-style record document path, derives the old-style path for
    its document ID, and places a mirror copy when one does not already exist.
    State counts are logged at the end.
    """
    start_time = datetime.datetime.now()
    LOG.info("Start: Mirror record documents in old-style naming.")
    conn = credential.UNCPathCredential(path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE)
    count = Counter()
    with conn:
        for record_path in rlid_record_paths():
            record_name = os.path.basename(record_path)
            record_id, extension = os.path.splitext(record_name)
            mirror_path = rlid_record_path_old(record_id, extension)
            # Empty/None mirror path means the ID has no database entry.
            if not mirror_path:
                state = "not in database"
            elif os.path.exists(mirror_path):
                state = "already mirrored"
            elif place_record_old(record_path):
                state = "mirrored"
            else:
                state = "failed to mirror"
                LOG.warning("%r failed to mirror to %r.", record_name, mirror_path)
            count[state] += 1
    document.log_state_counts(count, documents_type="records")
    LOG.info("End: Mirror.")
    elapsed(start_time, LOG)
def deeds_records_update():
    """Run update for deeds & records documents RLID repository.

    Pipeline: move drop-files to staging, extract record archives, delete
    stray log/cross-reference files, convert record images to PDF, then place
    the PDFs in the RLID repository (new- and old-style locations).
    """
    start_time = datetime.datetime.now()
    PATH["logfile"] = os.path.join(
        PATH["staging"], "Deeds_Records_Update_{}.log".format(start_time.year)
    )
    conn = credential.UNCPathCredential(PATH["staging"], **credential.RLID_DATA_SHARE)
    with conn:
        # Attach logfile handler for staging logfile.
        logfile = logging.FileHandler(PATH["logfile"])
        logfile.setLevel(logging.INFO)
        logfile.setFormatter(LOGFILE_FORMATTER)
        LOG.addHandler(logfile)
        try:
            LOG.info("START SCRIPT: Update RLID deeds & records repository.")
            _move_drop_files_to_staging()
            _extract_record_archives()
            _delete_staging_junk_files()
            _convert_record_images()
            _place_record_pdfs()
            elapsed(start_time, LOG)
            LOG.info("END SCRIPT")
        finally:
            # Fix: handler was previously never detached, so repeated calls in
            # one process duplicated log lines & leaked the open logfile.
            LOG.removeHandler(logfile)
            logfile.close()


def _move_drop_files_to_staging():
    """Move deeds & records drop-files into the staging directory."""
    LOG.info("Start: Move deeds & records drop-files to staging directory.")
    drop_extensions = [".exe", ".pdf", ".zip"] + document.IMAGE_FILE_EXTENSIONS
    for file_name in os.listdir(PATH["drop"]):
        file_path = os.path.join(PATH["drop"], file_name)
        file_extension = os.path.splitext(file_name)[-1].lower()
        if os.path.isfile(file_path) and file_extension in drop_extensions:
            move_path = os.path.join(PATH["staging"], file_name)
            shutil.move(file_path, move_path)
            LOG.info("Moved %r to %r.", file_path, move_path)
    LOG.info("End: Move.")


def _extract_record_archives():
    """Extract staged record archives (self-extracting EXEs & zipfiles)."""
    LOG.info("Start: Extract record archives.")
    count = Counter()
    for file_path in path.folder_file_paths(PATH["staging"]):
        if os.path.splitext(file_path)[-1].lower() in [".exe", ".zip"]:
            count[extract_records(file_path, archive_original=True)] += 1
    document.log_state_counts(count, documents_type="archives")


def _delete_staging_junk_files():
    """Delete the log & cross-reference files some archives include."""
    # D&R archives include a few log & reference files; delete if present.
    for file_path in path.folder_file_paths(PATH["staging"]):
        for pattern in ["_logfile", "_xreffile"]:
            if pattern.lower() in file_path.lower():
                os.remove(file_path)
                # Fix: stop after removing; matching a second pattern would
                # re-remove the now-missing file and raise OSError.
                break


def _convert_record_images():
    """Replace staged record images with PDF versions."""
    LOG.info("Start: Replace record images with PDFs.")
    count = Counter()
    for file_path in path.folder_file_paths(PATH["staging"]):
        if (
            os.path.splitext(file_path)[-1].lower()
            in document.IMAGE_FILE_EXTENSIONS
        ):
            count[convert_image(file_path, delete_original=True)] += 1
    document.log_state_counts(count, documents_type="images")


def _place_record_pdfs():
    """Place staged record PDFs in the RLID repository (old & new style)."""
    LOG.info("Start: Place record PDFs in RLID repository.")
    count = Counter()
    for file_path in path.folder_file_paths(PATH["staging"]):
        if os.path.splitext(file_path)[-1].lower() == ".pdf":
            old_state = place_record_old(file_path)
            # Only delete the staged original once the old-style copy landed.
            new_state = place_record(
                file_path, delete_original=(old_state == "placed")
            )
            count.update([old_state, new_state])
    document.log_state_counts(count, documents_type="records")
def property_cards_staging_update():
    """Run update for RLID assessor property card staging repository.

    Copies changed property card documents from the Lane County source
    repository into the RLID staging repository, logging state counts.
    """
    LOG.info("Start: Update assessor property card staging repository.")
    start_time = datetime.datetime.now()
    # Collect source paths before opening the staging-share credential.
    source_paths = document.repository_file_paths(path.LANE_PROPERTY_CARDS)
    conn = credential.UNCPathCredential(
        path.RLID_DATA_STAGING_SHARE, **credential.RLID_DATA_SHARE
    )
    count = Counter()
    with conn:
        for card_path in source_paths:
            destination = os.path.join(
                REPO_PATH["property-card-staging"], os.path.basename(card_path)
            )
            if not document.changed(destination, card_path):
                continue
            count[document.update_document(card_path, destination)] += 1
    LOG.info("End: Update.")
    document.log_state_counts(count, documents_type="property cards (staging)")
    elapsed(start_time, LOG)
def tax_maps_staging_update():
    """Run update for RLID tax map staging repository.

    Copies changed tax map images from the Lane County source repository into
    the RLID staging repository, preserving the one-deep bin subfolder.
    """
    LOG.info("Start: Update tax map staging repository.")
    start_time = datetime.datetime.now()
    conn = credential.UNCPathCredential(
        path.RLID_DATA_STAGING_SHARE, **credential.RLID_DATA_SHARE
    )
    with conn:
        count = Counter()
        for image_path in document.repository_file_paths(path.LANE_TAX_MAP_IMAGES):
            # Tax maps have a one-deep bin: keep the immediate parent folder.
            bin_name = os.path.split(os.path.dirname(image_path))[-1]
            destination = os.path.join(
                REPO_PATH["tax-map-staging"], bin_name, os.path.basename(image_path)
            )
            if not document.changed(destination, image_path):
                continue
            count[document.update_document(image_path, destination)] += 1
    document.log_state_counts(count, documents_type="tax maps (staging)")
    elapsed(start_time, LOG)
    LOG.info("End: Update.")
def property_cards_update():
    """Run update for assessor property card RLID production repository.

    Promotes changed property card PDFs from the staging repository into the
    RLID production repository, logging state counts.
    """
    LOG.info("Start: Update RLID assessor property card repository.")
    start_time = datetime.datetime.now()
    # Collect staged PDFs before opening the production-share credential.
    staging_paths = document.repository_file_paths(
        REPO_PATH["property-card-staging"], file_extensions=[".pdf"]
    )
    conn = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    count = Counter()
    with conn:
        for card_path in staging_paths:
            production_path = rlid_document_path(
                os.path.basename(card_path), document_type="property-card"
            )
            if not document.changed(production_path, card_path):
                continue
            count[document.update_document(card_path, production_path)] += 1
    LOG.info("End: Update.")
    document.log_state_counts(count, documents_type="property cards")
    elapsed(start_time, LOG)
def tax_maps_update():
    """Run update for RLID tax map repository.

    Promotes tax maps released since the recorded RLID currency date from the
    staging repository into production (archiving replaced versions), then
    advances the currency date if anything was updated.
    """
    start_time = datetime.datetime.now()
    conn = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    with conn:
        # Attach logfile handler for repository update logfile.
        logfile = logging.FileHandler(
            os.path.join(
                REPO_PATH["tax-map"], "Tax_Map_Update_{}.log".format(start_time.year)
            )
        )
        logfile.setLevel(logging.INFO)
        logfile.setFormatter(LOGFILE_FORMATTER)
        LOG.addHandler(logfile)
        try:
            LOG.info("START SCRIPT: Update RLID tax map repository from staging.")
            file_name_release_date = tax_map_file_name_release_map(
                start_datetime=rlid_data_currency("Tax Maps")
            )
            count = Counter()
            # Iterate through path/date map, adding, archiving & updating.
            for file_name, release_date in file_name_release_date.items():
                rlid_path = rlid_document_path(file_name, document_type="tax-map")
                staging_path = rlid_document_path(
                    file_name, document_type="tax-map-staging"
                )
                result_key = update_tax_map(
                    staging_path, rlid_path, release_date, archive_previous=True
                )
                count[result_key] += 1
            document.log_state_counts(count, documents_type="tax maps")
            # Finally, update tax map repository currency date (if we placed any).
            if count["updated"]:
                rlid_data_currency_setter(
                    "Tax Maps", max(file_name_release_date.values())
                )
            elapsed(start_time, LOG)
            LOG.info("END SCRIPT: Update")
        finally:
            # Fix: handler was previously never detached, so repeated calls in
            # one process duplicated log lines & leaked the open logfile.
            LOG.removeHandler(logfile)
            logfile.close()
def missing_in_rlid_etl():
    """Run ETL for log of deeds & records documents missing in RLID.

    Writes a CSV in the staging directory listing every record document whose
    expected repository file does not exist on disk.
    """
    start_time = datetime.datetime.now()
    LOG.info(
        "Start: Compile table of deeds & records listed in Lane County records system,"
        " but not present in RLID repository."
    )
    conn = credential.UNCPathCredential(PATH["staging"], **credential.RLID_DATA_SHARE)
    csv_path = os.path.join(PATH["staging"], "Missing_in_RLID.csv")
    check_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    missing_count = 0
    with conn:
        # NOTE(review): "wb" mode with csv.writer is the Python 2 idiom; on
        # Python 3 this raises TypeError — confirm the target interpreter.
        csvfile = open(csv_path, "wb")
        with csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["document_id", "document_path", "check_time"])
            for record_path in rlid_record_paths():
                if os.path.exists(record_path):
                    continue
                record_id = os.path.splitext(os.path.basename(record_path))[0]
                writer.writerow((record_id, record_path, check_time))
                missing_count += 1
        LOG.info("Found %s missing documents.", missing_count)
        LOG.info("End: Compile.")
        elapsed(start_time, LOG)
def tax_maps_not_in_source_etl():
    """Run ETL for log of tax map documents in RLID but not source repository.

    We used to have an automatic check & retire for RLID tax maps that were no
    longer in the source repository. This pretty much retired the entire
    taxmap repository the night of 2015-05-07. This was because there appear
    to be times when the source repository is not reachable, and/or reports
    nothing in the source. For now, we will just log potential orphans.

    If you do need to "retire" a tax map no longer in use:

    1. Make an archive copy of the document with this function call:
        ```
        archive_tax_map(
            tax_map_path, archive_date=datetime.datetime.now(), is_replaced=False
        )
        ```

    2. Move the document file to the `RetiredNoReplacement` subfolder.

    3. Execute the following SQL statement:
        ```
        if exists (
            select 1 from RLID.dbo.Taxmap_Retired where image_filename = {file-name}
        )
        begin;
            update RLID.dbo.Taxmap_Retired
            set date_retired = {same-date-as-archive-above}
            where image_filename = {file-name};
        end;
        else
        begin;
            insert into RLID.dbo.Taxmap_Retired(image_filename, date_retired)
            values ({file-name}, {same-date-as-archive-above});
        end;
        delete from RLID.dbo.Taxmap_Image where image_filename = {file-name};
        ```
    """
    start_time = datetime.datetime.now()
    # Fix: implicit string concatenation previously produced "repositoryare"
    # (missing space) in this log message.
    LOG.info(
        "Start: Compile table of tax maps not mirrored between the Lane County & RLID"
        " repositories.\nAny tax maps in RLID not mirrored in the county repository"
        " are likely tax maps that no longer exist, and should be researched (and"
        " perhaps retired)."
    )
    conn = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    with conn:
        check_time = start_time.strftime("%Y-%m-%d %H:%M")
        # Normalize names with `fixed_file_name` so the two repositories
        # compare on equal footing. Staging is walked recursively (its tax
        # maps sit in one-deep bins); production is a flat listing.
        file_names = {
            "County": {
                fixed_file_name(name)
                for _, _, filenames in os.walk(REPO_PATH["tax-map-staging"])
                for name in filenames
                if name.lower().endswith(".pdf")
            },
            "RLID": {
                fixed_file_name(name)
                for name in os.listdir(REPO_PATH["tax-map"])
                if name.lower().endswith(".pdf")
            },
        }
        # Check each direction: in-County-not-RLID, then in-RLID-not-County.
        for repo, other in permutations(["County", "RLID"]):
            LOG.info("Checking %s repository for tax maps not mirrored.", repo)
            unmirrored_file_names = sorted(file_names[repo] - file_names[other])
            csv_path = os.path.join(
                REPO_PATH["tax-map"], "In_{}_Not_{}.csv".format(repo, other)
            )
            # NOTE(review): "wb" mode with csv.writer is the Python 2 idiom;
            # on Python 3 this raises TypeError — confirm the target
            # interpreter.
            csv_file = open(csv_path, "wb")
            with csv_file:
                csv_ = csv.writer(csv_file)
                csv_.writerow(("file_name", "check_time"))
                for file_name in unmirrored_file_names:
                    csv_.writerow((file_name, check_time))
            LOG.info(
                "Found %s tax maps in %s repository not mirrored in %s.",
                len(unmirrored_file_names),
                repo,
                other,
            )
    LOG.info("End: Compile.")
    elapsed(start_time, LOG)