Example #1
def test_pickle_it():
    pickle_path = os.path.join(MY_PATH, 'output/some_data.pickle')
    if os.path.exists(pickle_path):
        os.remove(pickle_path)
    util.pickle_it(some_data, pickle_path)
    x = util.unpickle_it(pickle_path)
    assert x[0] == some_data[0]
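The test above round-trips data through a helper pair in util. A minimal sketch of what util.pickle_it and util.unpickle_it might look like (an assumption for illustration; the real util module is not shown here):

import pickle


def pickle_it(obj, path):
    # Serialize obj to path with the standard pickle module (assumed behavior).
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def unpickle_it(path):
    # Load and return whatever object was stored at path.
    with open(path, 'rb') as f:
        return pickle.load(f)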
Example #2
def get_links(pickle_file):
    """Return [bibid, href, label] rows from a pickled feed (uses module-level base_url)."""
    feed_data = util.unpickle_it(pickle_file)
    results = []
    for r in feed_data:
        r_bibid = r['cul_metadata']['bibid']
        r_id = r['identifier']
        r_href = base_url + r_id
        r_label = r['cul_metadata']['label']
        results.append([r_bibid, r_href, r_label])
    return results
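get_links only touches three fields per record, so the pickled feed data needs roughly the shape sketched below. This is an assumption inferred from the code above, not real data:

# Hypothetical record shape consumed by get_links (illustrative only):
sample_feed = [
    {
        'identifier': 'some_item_id',
        'cul_metadata': {'bibid': '123456', 'label': 'Sample label'},
    },
]
# Given a module-level base_url, get_links would produce rows like:
# [['123456', base_url + 'some_item_id', 'Sample label']]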
Example #3
def springer_build_datastore(
        feed_stem,
        collection_title,
        subject_path,
        output_dir,
        query='onlinedatefrom:2001-01-01%20onlinedateto:2021-07-31'):
    """Capture and store API data, merging with CUL-specific metadata.

    Args:
        feed_stem (str): short string used in file naming (no spaces)
        collection_title (str): Display title of the feed
        subject_path (str): Path to local subject lookup table
        output_dir (str): Path to local output directory
        query (str, optional): Boundary parameters for API request. Defaults to 'onlinedatefrom:2001-01-01%20onlinedateto:2021-07-31'.

    Returns:
        None or str: Saves data to file; returns a brief status string when updating an existing datastore
    """
    out_file = feed_stem + ".json"
    url = "https://ebooks-test.library.columbia.edu/static-feeds/springer/" + \
        out_file

    data_store = os.path.join(output_dir, feed_stem + '_datastore.json')

    subject_data = util.unpickle_it(subject_path)

    x = get_springer_batch(q=query)

    print("Retrieved " + str(len(x)) + " books.")
    for r in x:
        # TODO: look up bibids
        r['cul_metadata'] = {
            'bibid': 'XXXXXX',
            'feed_id': feed_stem,
            'collection_name': collection_title,
            'retrieved': NOW
        }
        doi = r['doi']
        try:
            r['subjects'] = subject_data[doi]
        except KeyError:
            print("Warning: no subjects found for " + str(r['identifier']))

    if os.path.exists(data_store):
        x = springer_merge_records(x, data_store)
        print("Saving " + str(len(x)) + " records to " + str(data_store) +
              "...")
        with open(data_store, "w") as f:
            json.dump(x, f)
        return "Update of " + str(data_store) + " complete."
    else:
        print("Saving to " + str(data_store) + "...")
        with open(data_store, "w") as f:
            json.dump(x, f)
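springer_merge_records is referenced above but not shown. As a rough sketch of what a merge step like this might do, assuming new API records should replace any existing datastore records that share a DOI (an assumption, not the project's actual implementation):

import json


def merge_records_by_doi(new_records, data_store_path):
    # Hypothetical merge: load the existing datastore, index records by DOI,
    # let freshly retrieved records overwrite stale ones, and return the list.
    with open(data_store_path, 'r') as f:
        existing = json.load(f)
    merged = {r['doi']: r for r in existing}
    merged.update({r['doi']: r for r in new_records})
    return list(merged.values())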
Example #4
def ids_from_pickle(pickle_path):
    """Return list of identifiers from pickled IA collection. For troubleshooting purposes.

    Args:
        pickle_path (str): path to pickle file

    Returns:
        list: list of ids
    """
    x = util.unpickle_it(pickle_path)
    return [r['identifier'] for r in x['data']]
Example #5
def get_bibs_and_ids(pickle_path):
    """Return a list of dicts pairing each record's collection, bibid, and id."""
    data = util.unpickle_it(pickle_path)['data']
    output = []
    for r in data:
        # Reset per record so a missing dc.identifier.uri doesn't carry over
        # the previous record's id (or raise NameError on the first record).
        the_id = None
        for e in r['metadata']:
            if e['key'] == 'dc.identifier.uri':
                the_uri = e['value']
                the_id = the_uri.split('/')[-1]
        output.append({
            'collection': r['cul_metadata']['collection_name'],
            'bibid': r['cul_metadata']['bibid'],
            'id': the_id
        })

    return output
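The id here is just the last path segment of the dc.identifier.uri value; a quick illustration with a made-up URI (not real data):

# Made-up URI, for illustration only:
the_uri = 'https://example.org/some/path/item_12345'
the_id = the_uri.split('/')[-1]  # -> 'item_12345'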
Example #6
def build_feed(pickle_path,
               collection_abbr,
               chunk_size=100,
               output_dir='output/ia'):
    """Saves OPDS output to XML file(s). Returns error data (missing elements, etc. to be sent to report datasheet.

    Args:
        pickle_path (str): path to file
        collection_abbr (str): abbreviation of collection
        chunk_size (int, optional): Number of records per page. Defaults to 100.
        output_dir (str, optional): Path to output folder. Defaults to 'output/ia'.

    Returns:
        list: Errors/warnings encountered.
    """
    # Saves output to XML file(s). Returns error data (missing elements, etc.)
    # to be sent to report datasheet.
    global clio_string
    clio_string = "Go to catalog record in CLIO."
    global now
    # now = datetime.today().isoformat()  # Current timestamp in ISO
    now = datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S.%fZ")  # Current timestamp in ISO
    base_url = "https://ebooks.library.columbia.edu/static-feeds/ia/" + collection_abbr + "/"
    base_folder = output_dir + '/' + collection_abbr + '/'

    # Unpack the data
    the_records = util.unpickle_it(pickle_path)['data']

    # some collection-level info to use
    feed_stem = the_records[0]['cul_metadata']['feed_id']
    collection_title = the_records[0]['cul_metadata']['collection_name']

    # Divide list into chunks
    total_count = len(the_records)
    print('Total count: ' + str(total_count))
    running_count = 0
    the_chunks = divide_list(the_records, chunk_size)

    # Collect error/warning rows across all chunks, not just the last one
    report_data = []

    for idx, record_chunk in enumerate(the_chunks):

        running_count += len(record_chunk)
        print('Running_count = ' + str(running_count))
        print('')
        page_no = idx + 1
        if page_no > 1:
            feed_name = feed_stem + '_p' + str(page_no) + '.xml'
        else:
            feed_name = feed_stem + '.xml'

        feed_next_name = feed_stem + '_p' + str(page_no + 1) + '.xml'

        # Set up root and top-level elements
        root = etree.Element("feed", nsmap=NSMAP)
        feed_id = etree.SubElement(root, "id")
        feed_id.text = base_url + feed_stem
        feed_title = etree.SubElement(root, "title")
        # feed_title.text = collection_title + " | Columbia University Libraries"
        feed_title.text = collection_title
        feed_updated = etree.SubElement(root, "updated")
        feed_updated.text = now

        feed_link = etree.SubElement(root,
                                     "link",
                                     href=base_url + feed_name,
                                     rel="self")

        # Add feed_next, only if it is not the last one
        if running_count < total_count:
            feed_link_next = etree.SubElement(root,
                                              "link",
                                              href=base_url + feed_next_name,
                                              rel="next",
                                              title="Next")

        for record in record_chunk:
            bibid = record['cul_metadata']['bibid']

            e = make_entry(root, record, bibid)
            if e:  # if the entry produced errors, add them to the report list.
                # pprint(e)
                error_report = [
                    str(now), collection_title, feed_name, bibid,
                    record['identifier'], '; '.join(e)
                ]
                report_data.append(error_report)

        # Save result xml tree
        with open(base_folder + feed_name, 'wb') as f:
            f.write(etree.tostring(root, pretty_print=True))

    return report_data
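build_feed relies on two helpers not shown here, divide_list and make_entry. A minimal sketch of a chunking helper compatible with how divide_list is called above (an assumption; the project's actual helper may differ):

def divide_list(the_list, chunk_size):
    # Return successive chunk_size-sized slices of the_list.
    return [the_list[i:i + chunk_size]
            for i in range(0, len(the_list), chunk_size)]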