def generate_cli(ctx, export_type, verbosity, merge_version, output_path, cache_path, force):
    """Generate Zephir export files for HathiTrust."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)

    # Reuse an existing cache when one is given; otherwise build one
    if cache_path and os.path.exists(cache_path):
        console.debug("Using existing cache {}".format(cache_path))
        cache = cache_path
    else:
        cache = ht_bib_cache(
            console=console,
            cache_path=cache_path,
            merge_version=merge_version,
            force=force,
        )

    if export_type == "ht-bib-full":
        ht_bib_full(
            console=console,
            cache_path=cache,
            output_path=output_path,
            merge_version=merge_version,
            force=force,
        )
    elif export_type == "ht-bib-incr":
        ht_bib_incr(
            console=console,
            cache_path=cache,
            merge_version=merge_version,
            force=force,
        )
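
# A minimal sketch of driving generate_cli directly, bypassing the CLI parser.
# Assumptions (not confirmed by the source): ctx is unused by the body above,
# so None is safe to pass, and the cache/output file names are hypothetical.
# An existing file at cache_path is reused; otherwise ht_bib_cache builds one.
def sketch_generate_full_export():
    generate_cli(
        ctx=None,
        export_type="ht-bib-full",  # selects the ht_bib_full branch
        verbosity=1,
        merge_version="v3",
        output_path="ht_bib_export_full.json",
        cache_path="cache-v3.db",
        force=False,
    )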
def compare_cache_cli(ctx, files, verbosity):
    """Compare export caches for content differences.

    Ignores datetime of cache creation."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)

    f1_cache = ExportCache(path=set_abs_filepath(files[0]))
    f1_set = f1_cache.frozen_content_set()
    f2_cache = ExportCache(path=set_abs_filepath(files[1]))
    f2_set = f2_cache.frozen_content_set()

    # Compare the sets directly: equal hashes would not guarantee equal sets
    if f1_set != f2_set:
        for line in f1_set - f2_set:
            console.out("-(cid:{},key:{})".format(line[0], line[1]))
        for line in f2_set - f1_set:
            console.out("+(cid:{},key:{})".format(line[0], line[1]))
        console.info("Differences found between cache files")
    else:
        console.info("No differences found between cache files")
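
# A self-contained sketch of the frozenset diff used above, with hypothetical
# (cid, key) tuples standing in for ExportCache.frozen_content_set() output.
# Entries only in the first set print with "-", only in the second with "+".
def sketch_cache_diff():
    f1_set = frozenset([("001", "mdp.001"), ("002", "mdp.002")])
    f2_set = frozenset([("001", "mdp.001"), ("003", "mdp.003")])
    if f1_set != f2_set:
        for cid, key in f1_set - f2_set:
            print("-(cid:{},key:{})".format(cid, key))  # only in the first cache
        for cid, key in f2_set - f1_set:
            print("+(cid:{},key:{})".format(cid, key))  # only in the second cache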
def test_create_with_custom_cache_output(td_tmpdir, env_setup, capsys, pytestconfig):
    # SETUP TODO (cscollett: there may be a better place to put this)
    # set temp current working directory
    real_cwd = os.getcwd()
    os.chdir(td_tmpdir)
    for merge_version in ["v2", "v3"]:
        shutil.copyfile(
            os.path.join(td_tmpdir, "cache-{}-ref.db".format(merge_version)),
            os.path.join(td_tmpdir, "my_custom_cache.db"),
        )
        console = ConsoleMessenger(verbosity=pytestconfig.getoption("verbose"))
        ht_bib_incr(
            console=console,
            cache_path="my_custom_cache.db",
            output_path="my_custom_output.json",
            merge_version=merge_version,
            force=True,
        )
        assert filecmp.cmp(
            os.path.join(td_tmpdir, "my_custom_output.json"),
            os.path.join(
                td_tmpdir, "{}-ht_bib_export_incr_ref.json".format(merge_version)
            ),
        )
        # clean up to avoid a name conflict with the next merge-version
        os.remove(os.path.join(td_tmpdir, "my_custom_output.json"))
    os.chdir(real_cwd)
def test_create_bib_export_incr(td_tmpdir, env_setup, capsys, pytestconfig):
    for merge_version in ["v2", "v3"]:
        os.rename(
            os.path.join(td_tmpdir, "cache-{}-ref.db".format(merge_version)),
            os.path.join(
                td_tmpdir,
                "cache-{}-{}.db".format(
                    merge_version, datetime.datetime.today().strftime("%Y-%m-%d")
                ),
            ),
        )
        console = ConsoleMessenger(verbosity=pytestconfig.getoption("verbose"))
        ht_bib_incr(console=console, merge_version=merge_version, force=True)
        export_filename = "ht_bib_export_incr_{}.json".format(
            datetime.datetime.today().strftime("%Y-%m-%d")
        )
        assert filecmp.cmp(
            os.path.join(td_tmpdir, export_filename),
            os.path.join(
                td_tmpdir, "{}-ht_bib_export_incr_ref.json".format(merge_version)
            ),
        )
        # clean up to avoid a name conflict with the next merge-version
        os.remove(os.path.join(td_tmpdir, export_filename))
def compare_file_cli(ctx, files, verbosity):
    """Compare export files for content differences."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)
    count = 0
    with open(files[0]) as a, open(files[1]) as b:
        for line_a in a:
            count += 1
            if line_a != b.readline():
                console.info("Differences start on line: {}".format(count))
                raise SystemExit(0)
        # If the second file is longer, its extra lines are a difference too
        if b.readline():
            console.info("Differences start on line: {}".format(count + 1))
            raise SystemExit(0)
    console.info("No differences found between files")
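
# An alternative sketch of the same comparison using itertools.zip_longest,
# which makes the unequal-length case explicit: a missing line on either side
# compares unequal to the None fillvalue. Not the CLI's implementation, just
# an illustration of the line-by-line approach.
import itertools

def sketch_compare_files(path_a, path_b):
    with open(path_a) as a, open(path_b) as b:
        for count, (line_a, line_b) in enumerate(
            itertools.zip_longest(a, b, fillvalue=None), start=1
        ):
            if line_a != line_b:
                return count  # first differing line number
    return None  # no differences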
def audit(filepath, quiet, verbose, dry_run, suffix):
    """audit.py: Audit a ZED log file to ensure all of its data is represented
    in the database."""
    # Print handler to manage when and how messages should print
    console = ConsoleMessenger(quiet, verbose)

    # REQUIREMENTS
    if len(filepath) == 0:
        console.error("No files given to process.")
        sys.exit(1)

    # APPLICATION SETUP
    # load environment
    env = environs.Env()
    env.read_env()

    ROOT_PATH = os.environ.get("ZED_ROOT_PATH") or os.path.dirname(__file__)
    ENV = os.environ.get("ZED_ENV")
    CONFIG_PATH = os.environ.get("ZED_CONFIG_PATH") or os.path.join(ROOT_PATH, "config")
    OVERRIDE_CONFIG_PATH = os.environ.get("ZED_OVERRIDE_CONFIG_PATH")

    # load all configuration files in directory
    config = utils.load_config(CONFIG_PATH)

    # used in testing: config files in test data override local config files
    if OVERRIDE_CONFIG_PATH is not None:
        config = utils.load_config(OVERRIDE_CONFIG_PATH, config)

    # DATABASE SETUP
    # Create database client, connection manager.
    db = config.get("zed_db", {}).get(ENV)
    DB_CONNECT_STR = str(utils.db_connect_url(db))
    engine = sqla.create_engine(DB_CONNECT_STR)

    # Create classes through reflection
    Base = sqla_automap.automap_base()
    Base.prepare(engine, reflect=True)
    Event = Base.classes.events

    # Create a session to the database.
    Session = sqla.orm.sessionmaker()
    Session.configure(bind=engine)
    session = Session()

    if dry_run:
        console.diagnostic("DRY RUN")

    # Iterate over the JSON log files to process
    for file in filepath:
        if not os.path.isfile(file):
            console.error("File path '{0}' does not exist. Exiting...".format(file))
            break

        # Get the file name and path, and create the destination file name
        f_path, f_name = os.path.split(file)
        renamed_file = "{0}.{1}".format(file, suffix)

        if os.path.isfile(renamed_file):
            console.error("Audit file '{0}' already exists.".format(renamed_file))
            break

        log_events = []
        db_events = set()
        file_pass = True  # Assume valid until a line is found invalid

        # Open file and process
        with open(file) as f_io:
            ln_cnt = 0
            console.diagnostic("Auditing: " + file)
            for line in f_io:
                ln_cnt += 1
                try:
                    log_events.append(json.loads(line.strip()))
                except json.decoder.JSONDecodeError:
                    file_pass = False
                    console.error("ERROR: Invalid JSON on line {0}".format(ln_cnt))
                    break  # invalid JSON, stop successive validation routines

        if file_pass and len(log_events) > 0:
            # Query the database for events of this type within a window
            # bracketing the file's first and last timestamps (see sketch below)
            query_params = {
                "event_type": log_events[0]["type"],
                "first_timestamp": (
                    iso8601.parse_date(log_events[0]["timestamp"])
                    - datetime.timedelta(seconds=60)
                ).isoformat("T"),
                "last_timestamp": (
                    iso8601.parse_date(log_events[-1]["timestamp"])
                    + datetime.timedelta(seconds=60)
                ).isoformat("T"),
            }

            session = Session()
            try:
                query = (
                    session.query(Event.event_key)
                    .filter(Event.timestamp >= query_params["first_timestamp"])
                    .filter(Event.timestamp <= query_params["last_timestamp"])
                    .filter(Event.type == query_params["event_type"])
                )
                for event in query.all():
                    db_events.add(event.event_key)
            except Exception as e:
                session.rollback()
                raise e
            finally:
                session.close()

            for event in log_events:
                if event["event"] not in db_events:
                    file_pass = False
                    console.error(
                        "ERROR: Missing event {0} in database.".format(event["event"])
                    )

        # Report results
        if file_pass is False:
            console.error("File {0}: fail.".format(file))
        else:
            if not dry_run:
                os.rename(file, renamed_file)
            console.report(
                "File {0}: pass. {1} event(s) audited.".format(file, len(log_events))
            )

    console.report("Done!")
    sys.exit(0)
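
# A minimal sketch of the audit's timestamp-window logic: the database query
# above brackets the log file's first and last event timestamps with a
# 60-second cushion on each side. The timestamp values here are hypothetical.
import datetime
import iso8601

def sketch_window(first_ts="2019-01-01T12:00:00Z", last_ts="2019-01-01T12:05:00Z"):
    cushion = datetime.timedelta(seconds=60)
    first = (iso8601.parse_date(first_ts) - cushion).isoformat("T")
    last = (iso8601.parse_date(last_ts) + cushion).isoformat("T")
    return first, last  # bounds for the Event.timestamp range filter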
def validate(filepath, quiet, verbose, dry_run, suffix):
    """validate.py: Validate a ZED log file to ensure all of its data is JSON
    and conforms to the schema."""
    # Print handler to manage when and how messages should print
    console = ConsoleMessenger(quiet, verbose)

    # REQUIREMENTS
    if len(filepath) == 0:
        console.error("No files given to process.")
        sys.exit(1)

    # APPLICATION SETUP
    # load environment
    env = environs.Env()
    env.read_env()

    schema_file = os.path.join(os.path.dirname(__file__), "config/zed_schema.json")
    with open(schema_file, "r") as f:
        schema_data = f.read()
    schema = json.loads(schema_data)

    if dry_run:
        console.diagnostic("DRY RUN")

    # Iterate over the JSON log files to process
    for file in filepath:
        if not os.path.isfile(file):
            console.error("File path '{0}' does not exist.".format(file))
            break

        # Get the file name and path, and create the destination file name
        f_path, f_name = os.path.split(file)
        renamed_file = "{0}.{1}".format(file, suffix)

        if os.path.isfile(renamed_file):
            console.error("Validated file '{0}' already exists.".format(renamed_file))
            break

        # Open file and validate
        with open(file) as f_io:
            event_counter = defaultdict(int)
            file_valid = True  # Assume valid until a line is found invalid
            ln_cnt = 0
            console.diagnostic("Validating: {}".format(file))
            for line in f_io:
                ln_cnt += 1

                # JSON VALIDATION BLOCK
                try:
                    event = json.loads(line.strip())
                    jsonschema.validate(event, schema)
                except json.decoder.JSONDecodeError:
                    file_valid = False
                    console.error("Invalid JSON on line {0}".format(ln_cnt))
                    break
                except jsonschema.exceptions.ValidationError:
                    file_valid = False
                    console.error("JSON Validation error on line {0}".format(ln_cnt))
                    break

                # DUPE-DETECTION BLOCK
                event_counter[event["event"]] += 1
                if event_counter[event["event"]] > 1:
                    file_valid = False
                    console.error(
                        "Duplicate ID ({0}) found on line {1}".format(
                            event["event"], ln_cnt
                        )
                    )
                    break

        # Report results
        if file_valid is False:
            console.error("File {0}: invalid.".format(file))
        else:
            if not dry_run:
                os.rename(file, renamed_file)
            console.report(
                "File {0}: valid. {1} event(s) validated.".format(file, ln_cnt)
            )

    console.report("Done!")
    sys.exit(0)
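
# A self-contained sketch of the validate loop's two checks, run over in-memory
# lines instead of a file. The schema here is a hypothetical stand-in for
# config/zed_schema.json; jsonschema.validate raises ValidationError on failure.
import json
from collections import defaultdict
import jsonschema

def sketch_validate_lines(lines):
    schema = {"type": "object", "required": ["event"]}  # hypothetical schema
    event_counter = defaultdict(int)
    for ln_cnt, line in enumerate(lines, start=1):
        try:
            event = json.loads(line.strip())
            jsonschema.validate(event, schema)
        except (json.decoder.JSONDecodeError, jsonschema.exceptions.ValidationError):
            return "line {} invalid".format(ln_cnt)
        event_counter[event["event"]] += 1  # duplicate-ID detection
        if event_counter[event["event"]] > 1:
            return "duplicate ID on line {}".format(ln_cnt)
    return "all lines valid"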