示例#1
0
def main(test, ocns):
    """For a given list of OCNs, find all resolved OCNs clusters from the OCLC Concordance Table.

    Provide the OCNs list in space separated integers, for example: 1 123.

    cmd: pipenv run python oclc_lookup.py 1 123

    returns: {(123, 18329830, 67524283), (1, 6567842, 9987701, 53095235, 433981287)}
    """
    # NOTE(review): the docstring above is presumably what ctx.get_help()
    # prints further down, so it is kept verbatim -- confirm against the
    # click decorators, which are not visible here.

    # Self-test mode: run the module's tests and stop with a success status.
    if test:
        click.echo("Running tests ...")
        tests()
        exit(0)

    # Both paths are read from the config up front, so a missing key fails
    # even when an override is present in the environment.
    settings = get_configs_by_filename("config", "cid_minting")
    configured_primary = settings["primary_db_path"]
    configured_cluster = settings["cluster_db_path"]

    # A set, non-empty OVERRIDE_* environment variable wins over the config.
    primary_path = os.environ.get("OVERRIDE_PRIMARY_DB_PATH") or configured_primary
    cluster_path = os.environ.get("OVERRIDE_CLUSTER_DB_PATH") or configured_cluster

    ocns_list = [int(ocn) for ocn in ocns]

    # Without any OCN there is nothing to look up: show the help, exit 1.
    if not ocns_list:
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
        exit(1)

    clusters = get_clusters_by_ocns(ocns_list, primary_path, cluster_path)
    click.echo(clusters)
    exit(0)
示例#2
0
def main():
    """Run zephir_clusters_lookup on a fixed sample of OCNs and print the result.

    The first command-line argument selects the entry of the 'zephir_db'
    config to connect with; it defaults to "test" when omitted.
    """
    env = sys.argv[1] if len(sys.argv) > 1 else "test"

    configs = get_configs_by_filename('config', 'zephir_db')
    # The whole loaded config is echoed before the lookup runs.
    print(configs)

    connect_url = utils.db_connect_url(configs[env])
    db_connect_str = str(connect_url)

    # Hard-coded sample input for this demo run.
    sample_ocns = [6758168, 15437990, 5663662, 33393343, 28477569, 8727632]

    print(zephir_clusters_lookup(db_connect_str, sample_ocns))
示例#3
0
def main():
    """Export the MARCXML records of a list of IDs into one <collection> file.

    Command line arguments (positional, all optional):
        argv[1]: Key of the 'zephir_db' config entry to connect with.
                 Defaults to "dev".
        argv[2]: Input file holding one ID per line.
                 Defaults to ./data/htids.txt
        argv[3]: Output file. Defaults to ./output/marc_records.xml

    For every non-blank input line the records returned by
    find_marcxml_records_by_id are appended to the output. Each record's
    own XML declaration and MARC21 namespace attribute are stripped, since
    the enclosing <collection> element already carries them.
    """
    env = sys.argv[1] if len(sys.argv) > 1 else "dev"
    input_filename = sys.argv[2] if len(sys.argv) > 2 else "./data/htids.txt"
    output_filename = (sys.argv[3] if len(sys.argv) > 3
                       else "./output/marc_records.xml")

    configs = get_configs_by_filename('config', 'zephir_db')
    db_connect_str = str(utils.db_connect_url(configs[env]))

    # Compiled once instead of per record; raw strings avoid the invalid
    # "\?" escape the non-raw literals relied on.
    xml_declaration = re.compile(
        r'<\?xml version="1.0" encoding="UTF-8"\?>\n')
    marc_namespace = re.compile(r' xmlns="http://www.loc.gov/MARC21/slim"')

    # The input is opened first so a missing input file does not truncate
    # an existing output file. The output is written as UTF-8 to agree
    # with the encoding announced in its XML declaration.
    with open(input_filename) as infile, \
            open(output_filename, 'w', encoding="utf-8") as outfile:
        # The XML declaration must be the very first thing in the
        # document, ahead of the root element.
        outfile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        outfile.write(
            "<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n")

        for line in infile:
            record_id = line.strip()
            if not record_id:
                # Blank lines carry no ID; skip them rather than query.
                continue
            for record in find_marcxml_records_by_id(db_connect_str,
                                                     record_id):
                marcxml = xml_declaration.sub("", record["metadata"])
                marcxml = marc_namespace.sub("", marcxml)
                outfile.write(marcxml)

        outfile.write("</collection>\n")

    print("output marcxml file: {}".format(output_filename))
示例#4
0
def main():
    """Read from or write to the cid_minting_store table.

    The table stores an identifier and a CID in pairs.

    Command line arguments:
        argv[1]: Server environment (required). Can be test, dev, stg, or prd.
        argv[2]: Action. Can only be 'read' or 'write'.
        argv[3]: Data type. Can only be 'ocn' or 'sysid'.
        argv[4]: Data. OCNs or a local system ID.
                 OCNs format:
                   Comma separated strings without spaces in between any
                   two values. For example: 8727632,32882115
                 Local system ID: a string.
        argv[5]: A CID. Only required when action is 'write'.

    The function always terminates the process:
        read:  prints the lookup result as JSON and exits with status 0.
        write: exits with status 0 when insert_a_record returns "Success",
               otherwise with status 1.
        Invalid arguments print the usage text and exit with status 1.
    """
    argc = len(sys.argv)
    if argc not in (5, 6):
        print("Parameter error.")
        usage(sys.argv[0])
        sys.exit(1)

    env, action, data_type, data = sys.argv[1:5]
    cid = sys.argv[5] if argc == 6 else None

    arguments_valid = (
        env in ("test", "dev", "stg", "prd")
        and action in ("read", "write")
        and data_type in ("ocn", "sysid")
        # A write needs the CID to pair the identifier with.
        and not (action == "write" and cid is None)
    )
    if not arguments_valid:
        usage(sys.argv[0])
        sys.exit(1)

    cmd_options = "cmd options: {} {} {} {}".format(env, action, data_type,
                                                    data)
    if cid:
        cmd_options += " " + cid

    configs = get_configs_by_filename('config', 'cid_minting')
    logfile = configs['logpath']
    db_config = str(utils.db_connect_url(configs[env]['minter_db']))

    logging.basicConfig(
        level=logging.DEBUG,
        filename=logfile,
        format="%(asctime)s %(levelname)-4s %(message)s",
    )
    logging.info("Start " + os.path.basename(__file__))
    logging.info(cmd_options)

    # A set, non-empty OVERRIDE_DB_CONNECT_STR wins over the configured URL.
    DB_CONNECT_STR = os.environ.get('OVERRIDE_DB_CONNECT_STR') or db_config

    db = prepare_database(DB_CONNECT_STR)
    engine = db['engine']
    session = db['session']
    CidMintingStore = db['table']

    results = {}
    inserted = None
    try:
        if action == "read":
            if data_type == "ocn":
                results = find_cids_by_ocns(engine, data.split(","))
            else:  # validated above, so this is "sysid"
                results = find_cid_by_sysid(CidMintingStore, session, data)
        else:  # validated above, so this is "write"
            record = CidMintingStore(type=data_type, identifier=data, cid=cid)
            inserted = insert_a_record(session, record)
    finally:
        # Dispose of the engine even when the query or insert raises.
        engine.dispose()

    if action == "read":
        print(json.dumps(results))
        sys.exit(0)

    sys.exit(0 if inserted == "Success" else 1)
示例#5
0
def main():
    """Retrieve Zephir clusters by OCNs, once or as a directory-scanning loop.

    Command line arguments:
        argv[1]: Server environment (required). The code accepts test, dev,
                 stg, or prd.
        argv[2]: List of OCNs (optional).
                 Comma separated strings without spaces in between any two
                 values. For example: 1,6567842,6758168,8727632

    When OCNs are given: run a single cid_inquiry, print the result as a
    JSON string and exit with status 0.

    When OCNs are not given: scan cid_inquiry_data_dir for "*.txt" files.
    For each one, the OCNs are taken from the file name, the JSON result is
    written over that file, and the file is then renamed into
    cid_inquiry_done_dir with ".done" appended.

    NOTE(review): as visible here the scan loop has no exit condition and
    no pause between scans -- confirm whether an idle timeout was intended.
    """
    argc = len(sys.argv)
    if argc not in (2, 3):
        script = sys.argv[0]
        print("Parameter error.")
        print("Usage: {} env[dev|stg|prd] optional_comma_separated_ocns".format(script))
        print("{} dev".format(script))
        print("{} dev 1,6567842,6758168,8727632".format(script))
        exit(1)

    env = sys.argv[1]
    if env not in ("test", "dev", "stg", "prd"):
        usage(sys.argv[0])
        exit(1)

    zephir_db_config = get_configs_by_filename("config", "zephir_db")
    db_connect_url = str(utils.db_connect_url(zephir_db_config[env]))

    minting_config = get_configs_by_filename("config", "cid_minting")
    primary_db_path = minting_config["primary_db_path"]
    cluster_db_path = minting_config["cluster_db_path"]
    logfile = minting_config['logpath']
    inquiry_dir = minting_config['cid_inquiry_data_dir']
    done_dir = minting_config['cid_inquiry_done_dir']

    logging.basicConfig(
        level=logging.DEBUG,
        filename=logfile,
        format="%(asctime)s %(levelname)-4s %(message)s",
    )
    logging.info("Start " + os.path.basename(__file__))
    logging.info("Env: {}".format(env))

    # A set, non-empty OVERRIDE_* environment variable wins over the config.
    environ = os.environ
    DB_CONNECT_STR = environ.get("OVERRIDE_DB_CONNECT_STR") or db_connect_url
    PRIMARY_DB_PATH = environ.get("OVERRIDE_PRIMARY_DB_PATH") or primary_db_path
    CLUSTER_DB_PATH = environ.get("OVERRIDE_CLUSTER_DB_PATH") or cluster_db_path

    # One-shot mode: OCNs were passed on the command line.
    if argc == 3:
        requested_ocns = convert_comma_separated_str_to_int_list(sys.argv[2])
        answer = cid_inquiry(requested_ocns, DB_CONNECT_STR, PRIMARY_DB_PATH,
                             CLUSTER_DB_PATH)
        print(json.dumps(answer))
        exit(0)

    # Scan mode: keep re-listing the inquiry directory.
    while True:
        for entry in os.listdir(inquiry_dir):
            if not entry.endswith(".txt"):
                continue

            inquiry_path = os.path.join(inquiry_dir, entry)
            done_path = os.path.join(done_dir, entry + ".done")

            # The OCNs sit between a 37-character prefix and the ".txt"
            # suffix of the file name.
            # NOTE(review): the prefix is presumably a 36-character UUID
            # plus one separator -- verify against whatever writes the files.
            without_prefix = entry[37:]
            ocns_text = without_prefix[:-4]

            ocns_list = convert_comma_separated_str_to_int_list(ocns_text)
            results = cid_inquiry(ocns_list, DB_CONNECT_STR, PRIMARY_DB_PATH,
                                  CLUSTER_DB_PATH)

            # The answer replaces the inquiry file's content; the file is
            # then renamed into the done directory.
            with open(inquiry_path, 'w') as answer_file:
                answer_file.write(json.dumps(results))

            os.rename(inquiry_path, done_path)
示例#6
0
def main():
    """Score title similarity within each CID and write mid-range pairs to CSV.

    Command line arguments (positional, all optional):
        argv[1]: Key of the 'zephir_db' config entry to connect with.
                 Defaults to "dev".
        argv[2]: Input CSV with a header row and the CID in the first
                 column. Defaults to
                 ./data/cids_with_multi_primary_ocns.csv
        argv[3]: Output CSV. Defaults to
                 ./output/cids_with_multi_primary_ocns_similarity_scores.csv

    For each CID, the first row returned by find_zephir_titles_by_cid is
    the base (flag "B"). Every later row is compared to the base title key
    with FuzzyRatios. When the fuzzy ratio is between 30 and 70 inclusive,
    the base row and the compared row are both written; the base row is
    therefore repeated ahead of each qualifying row.
    """
    env = sys.argv[1] if len(sys.argv) > 1 else "dev"

    configs = get_configs_by_filename('config', 'zephir_db')
    db_connect_str = str(utils.db_connect_url(configs[env]))

    input_file = (sys.argv[2] if len(sys.argv) > 2
                  else "./data/cids_with_multi_primary_ocns.csv")
    output_file = (
        sys.argv[3] if len(sys.argv) > 3
        else "./output/cids_with_multi_primary_ocns_similarity_scores.csv")

    csv_columns = ["cid", "contribsys_id", "flag", "title_key", "lang",
                   "similarity_ratio", "partial_ratio", "token_sort",
                   "token_set"]

    def describe(result, flag):
        # Columns shared by the base row and the compared rows.
        return {
            "cid": result["cid"],
            "contribsys_id": result["contribsys_id"],
            "flag": flag,
            "title_key": result["title_key"],
            "lang": result["lang"].decode() if result["lang"] else "",
        }

    # The csv module needs files opened with newline="" so that it alone
    # controls line endings: no "\r\r\n" on Windows, and newlines embedded
    # in quoted fields survive reading.
    with open(input_file, newline="") as infile, \
            open(output_file, 'w', newline="") as outfile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the headers
        writer = csv.DictWriter(outfile, fieldnames=csv_columns)
        writer.writeheader()

        for fields in reader:
            if not fields:
                continue

            # Left-pad the CID with zeros, keeping the last 9 characters.
            cid = ("000000000" + fields[0])[-9:]

            base_row = None
            base_title_key = None
            for result in find_zephir_titles_by_cid(db_connect_str, cid):
                if base_row is None:
                    # First record of the CID: the comparison baseline.
                    base_title_key = result["title_key"]
                    base_row = describe(result, "B")
                    continue

                ratios = FuzzyRatios(base_title_key, result["title_key"])
                if 30 <= ratios.fuzzy_ratio <= 70:
                    pair_row = describe(result, "")
                    pair_row["similarity_ratio"] = ratios.fuzzy_ratio
                    pair_row["partial_ratio"] = ratios.fuzzy_partial_ratio
                    pair_row["token_sort"] = ratios.fuzzy_token_sort_ratio
                    pair_row["token_set"] = ratios.fuzzy_token_set_ratio
                    # The base row has no ratio columns; DictWriter leaves
                    # those cells empty.
                    writer.writerow(base_row)
                    writer.writerow(pair_row)