def main(test, ocns):
    """For a given list of OCNs, find all resolved OCNs clusters from the OCLC Concordance Table.
    Provide the OCNs list in space separated integers, for example: 1 123.
    cmd: pipenv run python oclc_lookup.py 1 123
    returns: {(123, 18329830, 67524283), (1, 6567842, 9987701, 53095235, 433981287)}
    """
    # Test mode short-circuits everything else.
    if test:
        click.echo("Running tests ...")
        tests()
        exit(0)

    configs = get_configs_by_filename("config", "cid_minting")
    # Environment overrides win over the configured database paths.
    primary_db = os.environ.get("OVERRIDE_PRIMARY_DB_PATH") or configs["primary_db_path"]
    cluster_db = os.environ.get("OVERRIDE_CLUSTER_DB_PATH") or configs["cluster_db_path"]

    parsed_ocns = [int(ocn) for ocn in ocns]
    if not parsed_ocns:
        # No OCNs given: print the command's help text and exit non-zero.
        click.echo(click.get_current_context().get_help())
        exit(1)

    click.echo(get_clusters_by_ocns(parsed_ocns, primary_db, cluster_db))
    exit(0)
def main():
    """Ad-hoc check: look up Zephir clusters for a fixed OCN list and print the result."""
    # First CLI argument selects the server environment; default to "test".
    env = sys.argv[1] if len(sys.argv) > 1 else "test"

    configs = get_configs_by_filename('config', 'zephir_db')
    print(configs)
    db_connect_str = str(utils.db_connect_url(configs[env]))

    # Hard-coded sample OCNs used as the lookup fixture.
    ocns_list = [6758168, 15437990, 5663662, 33393343, 28477569, 8727632]
    print(zephir_clusters_lookup(db_connect_str, ocns_list))
def main():
    """Export MARCXML records for a list of HT ids into one XML collection file.

    Command line arguments:
        argv[1]: server environment (default "dev").
        argv[2]: input file of HT ids, one per line (default ./data/htids.txt).
        argv[3]: output MARCXML file (default ./output/marc_records.xml).

    Bug fix: the XML declaration is now written BEFORE the <collection> root
    element. The original wrote the root element first, which produces a
    malformed XML document (the prolog must precede the root element).
    Also: the output file is managed with a context manager, and the local
    variable no longer shadows the builtin `id`.
    """
    if len(sys.argv) > 1:
        env = sys.argv[1]
    else:
        env = "dev"
    configs = get_configs_by_filename('config', 'zephir_db')
    db_connect_str = str(utils.db_connect_url(configs[env]))
    #test_zephir_search(db_connect_str)

    input_filename = sys.argv[2] if len(sys.argv) > 2 else "./data/htids.txt"
    output_filename = sys.argv[3] if len(sys.argv) > 3 else "./output/marc_records.xml"

    with open(output_filename, 'w') as outfile:
        # Prolog first, then the namespaced root element (fix: order was swapped).
        outfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        outfile.write("<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n")
        with open(input_filename) as infile:
            for line in infile:
                htid = line.strip()
                records = find_marcxml_records_by_id(db_connect_str, htid)
                for record in records:
                    # Strip each record's own prolog and namespace declaration so
                    # the records nest cleanly inside the single <collection> root.
                    marcxml = re.sub(r"<\?xml version=\"1.0\" encoding=\"UTF-8\"\?>\n", "", record["metadata"])
                    marcxml = re.sub(r" xmlns=\"http://www.loc.gov/MARC21/slim\"", "", marcxml)
                    outfile.write(marcxml)
        outfile.write("</collection>\n")
    print("output marcxml file: {}".format(output_filename))
def main(): """ Performs read and write actions to the cid_minting_store table which stores the identifier and CID in pairs. Command line arguments: argv[1]: Server environemnt (Required). Can be test, dev, stg, or prd. argv[2]: Action. Can only be 'read' or 'write' argv[3]: Data type. Can only be 'ocn' and 'sysid' argv[4]: Data. OCNs or a local system ID. OCNs format: Comma separated strings without spaces in between any two values. For example: 8727632,32882115 Local system ID: a string. argv[5]: A CID. Only required when Action='write' """ if (len(sys.argv) != 5 and len(sys.argv) != 6): print("Parameter error.") usage(sys.argv[0]) exit(1) env = sys.argv[1] action = sys.argv[2] data_type = sys.argv[3] data = sys.argv[4] cid = None if len(sys.argv) == 6: cid = sys.argv[5] if env not in ["test", "dev", "stg", "prd"]: usage(sys.argv[0]) exit(1) if action not in ["read", "write"]: usage(sys.argv[0]) exit(1) if data_type not in ["ocn", "sysid"]: usage(sys.argv[0]) exit(1) if action == "write" and cid == None: usage(sys.argv[0]) exit(1) cmd_options = "cmd options: {} {} {} {}".format(env, action, data_type, data) if cid: cmd_options += " " + cid configs = get_configs_by_filename('config', 'cid_minting') logfile = configs['logpath'] db_config = str(utils.db_connect_url(configs[env]['minter_db'])) logging.basicConfig( level=logging.DEBUG, filename=logfile, format="%(asctime)s %(levelname)-4s %(message)s", ) logging.info("Start " + os.path.basename(__file__)) logging.info(cmd_options) DB_CONNECT_STR = os.environ.get('OVERRIDE_DB_CONNECT_STR') or db_config db = prepare_database(DB_CONNECT_STR) engine = db['engine'] session = db['session'] CidMintingStore = db['table'] results = {} if action == "read": if data_type == "ocn": results = find_cids_by_ocns(engine, data.split(",")) if data_type == "sysid": results = find_cid_by_sysid(CidMintingStore, session, data) engine.dispose() print(json.dumps(results)) exit(0) if action == "write": record = CidMintingStore(type=data_type, 
identifier=data, cid=cid) inserted = insert_a_record(session, record) engine.dispose() if inserted != "Success": exit(1) else: exit(0)
def main(): """ Retrieves Zephir clusters by OCNs. Command line arguments: argv[1]: Server environemnt (Required). Can be dev, stg, or prd. argv[2]: List of OCNs (Optional). Comma separated strings without spaces in between any two values. For example: 1,6567842,6758168,8727632 When OCNs present: 1. retrieves Zephir clusters by given OCNs; 2. return Zephir clusters in JSON string. When OCNs is not present: 1. find OCNs from the next input file; 2. retrieves Zephir clusters by given OCNs; 3. write Zephir clusters in JSON string to output file; 4. repeat 1-3 indefinitely or when there are no input files for 10 minutes. """ if (len(sys.argv) != 2 and len(sys.argv) != 3): print("Parameter error.") print("Usage: {} env[dev|stg|prd] optional_comma_separated_ocns".format(sys.argv[0])) print("{} dev".format(sys.argv[0])) print("{} dev 1,6567842,6758168,8727632".format(sys.argv[0])) exit(1) env = sys.argv[1] if env not in ["test", "dev", "stg", "prd"]: usage(sys.argv[0]) exit(1) zephir_db_config = get_configs_by_filename("config", "zephir_db") db_connect_url = str(utils.db_connect_url(zephir_db_config[env])) cid_minting_config = get_configs_by_filename("config", "cid_minting") primary_db_path = cid_minting_config["primary_db_path"] cluster_db_path = cid_minting_config["cluster_db_path"] logfile = cid_minting_config['logpath'] cid_inquiry_data_dir = cid_minting_config['cid_inquiry_data_dir'] cid_inquiry_done_dir = cid_minting_config['cid_inquiry_done_dir'] logging.basicConfig( level=logging.DEBUG, filename=logfile, format="%(asctime)s %(levelname)-4s %(message)s", ) logging.info("Start " + os.path.basename(__file__)) logging.info("Env: {}".format(env)) DB_CONNECT_STR = os.environ.get("OVERRIDE_DB_CONNECT_STR") or db_connect_url PRIMARY_DB_PATH = os.environ.get("OVERRIDE_PRIMARY_DB_PATH") or primary_db_path CLUSTER_DB_PATH = os.environ.get("OVERRIDE_CLUSTER_DB_PATH") or cluster_db_path if (len(sys.argv) == 3): ocns_list = convert_comma_separated_str_to_int_list(sys.argv[2]) 
results = cid_inquiry(ocns_list, DB_CONNECT_STR, PRIMARY_DB_PATH, CLUSTER_DB_PATH) print(json.dumps(results)) exit(0) run_process = True while run_process: for file in os.listdir(cid_inquiry_data_dir): if file.endswith(".txt"): output_filename = os.path.join(cid_inquiry_data_dir, file) done_filename = os.path.join(cid_inquiry_done_dir, file + ".done") ocns_from_filename = file[37:][:-4] ocns_list = convert_comma_separated_str_to_int_list(ocns_from_filename) results = cid_inquiry(ocns_list, DB_CONNECT_STR, PRIMARY_DB_PATH, CLUSTER_DB_PATH) with open(output_filename, 'w') as output_file: output_file.write(json.dumps(results)) os.rename(output_filename, done_filename)
def main():
    """Score title similarity for CIDs that have multiple primary OCNs.

    Reads CIDs from an input CSV (first column), looks up each CID's Zephir
    titles, and writes a CSV of fuzzy-similarity scores comparing the first
    title of the cluster against every other title in the same cluster.

    Command line arguments:
        argv[1]: server environment (default "dev").
        argv[2]: input CSV of CIDs (default ./data/cids_with_multi_primary_ocns.csv).
        argv[3]: output CSV (default ./output/cids_with_multi_primary_ocns_similarity_scores.csv).
    """
    if (len(sys.argv) > 1):
        env = sys.argv[1]
    else:
        env = "dev"
    configs = get_configs_by_filename('config', 'zephir_db')
    db_connect_str = str(utils.db_connect_url(configs[env]))
    #test_zephir_search(db_connect_str)
    #test_match
    if len(sys.argv) > 2:
        input_file = sys.argv[2]
    else:
        input_file = "./data/cids_with_multi_primary_ocns.csv"
    if len(sys.argv) > 3:
        output_file = sys.argv[3]
    else:
        output_file = "./output/cids_with_multi_primary_ocns_similarity_scores.csv"
    # Output columns: cluster identity, a flag ("B" marks the baseline row),
    # the title key, language, and four fuzzy-matching scores.
    csv_columns = ["cid", "contribsys_id", "flag", "title_key", "lang",
                   "similarity_ratio", "partial_ratio", "token_sort", "token_set"]
    count = 0  # number of input rows processed
    with open(input_file) as infile, open(output_file, 'w') as outfile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the headers
        writer = csv.DictWriter(outfile, fieldnames=csv_columns)
        writer.writeheader()
        for fields in reader:
            count += 1
            if len(fields) > 0:
                # left padding 0s to CID
                cid = ("000000000" + fields[0])[-9:]
                #print (cid)
                results = find_zephir_titles_by_cid(db_connect_str, cid)
                first_item = True
                for result in results:
                    if first_item:
                        # The first title in the cluster is the baseline that
                        # every subsequent title is compared against.
                        title_key = result["title_key"]
                        first_item = False
                        result_base = {
                            "cid": result["cid"],
                            "contribsys_id": result["contribsys_id"],
                            "flag": "B",
                            "title_key": result["title_key"],
                            # `lang` appears to come back as bytes from the DB
                            # driver -- decoded here; TODO confirm encoding.
                            "lang": result["lang"].decode() if result["lang"] else "",
                        }
                    else:
                        # Compare this title against the cluster's baseline title.
                        ratios = FuzzyRatios(title_key, result["title_key"])
                        #print (ratios.fuzzy_ratio)
                        result_pair = {
                            "cid": result["cid"],
                            "contribsys_id": result["contribsys_id"],
                            "flag": "",
                            "title_key": result["title_key"],
                            "lang": result["lang"].decode() if result["lang"] else "",
                            "similarity_ratio": ratios.fuzzy_ratio,
                            "partial_ratio": ratios.fuzzy_partial_ratio,
                            "token_sort": ratios.fuzzy_token_sort_ratio,
                            "token_set": ratios.fuzzy_token_set_ratio,
                        }
                        # Only mid-range matches (30-70) are written out; note the
                        # baseline row is re-written before EACH qualifying pair,
                        # so it can appear multiple times per cluster.
                        if (ratios.fuzzy_ratio >= 30 and ratios.fuzzy_ratio <= 70):
                            writer.writerow(result_base)
                            writer.writerow(result_pair)