Example #1
def test_pickle_it():
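    """Round-trip some_data through pickle_it/unpickle_it and compare."""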
    pickle_path = os.path.join(MY_PATH, 'output/some_data.pickle')
    if os.path.exists(pickle_path):
        os.remove(pickle_path)
    util.pickle_it(some_data, pickle_path)
    x = util.unpickle_it(pickle_path)
    assert x[0] == some_data[0]
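
# Note: util.pickle_it / util.unpickle_it themselves aren't shown in these
# examples. A minimal sketch of what they plausibly do, inferred from the
# calls above (the real dcps_utils implementations may differ):
import pickle


def pickle_it(obj, path):
    """Serialize obj to path (assumed behavior of util.pickle_it)."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def unpickle_it(path):
    """Load and return the object pickled at path (assumed behavior)."""
    with open(path, 'rb') as f:
        return pickle.load(f)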
Example #2
def main():
    """Get IA collection specific to serial (Columbia Library Columns).
    """
    sheet_id = '1yTDyd5GQFEsVBiKOnt5T1ejBdXhxhmXVUn6jQ-dg_5I'
    sheet_tab = 'ColumbiaColumns'
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')
    the_err_sheet = dataSheet(sheet_id, 'errors!A:Z')

    output_folder = 'output/ia/'
    feed_stem = 'ia_clc_feed'
    collection_title = 'Columbia Library Columns'
    abbr = 'clc'

    pickle_path = output_folder + feed_stem + '.pickle'

    the_input = the_in_sheet.getData()
    heads = the_input.pop(0)  # remove header row

    the_records = [{'bibid': r[0], 'id': r[2], 'label': r[3]}
                   for r in the_input]

    feed_data = ia.extract_data(the_records, feed_stem, collection_title)

    # Copy the feed, dropping description fields that contain HTML links.
    feed_data_new = {'errors': feed_data['errors'], 'data': []}
    for e in feed_data['data']:
        new_entry = e
        new_entry['description'] = [d for d in new_entry['description']
                                    if '<a' not in d]
        feed_data_new['data'].append(new_entry)

    # Save to pickle.
    print('Saving ' + str(len(feed_data_new['data'])) +
          ' records to ' + pickle_path)
    util.pickle_it(feed_data_new, pickle_path)

    # Report any extraction errors.
    the_out_sheet.appendData(feed_data['errors'])

    # Generate XML from the pickled feed data.
    x = ia.build_feed(pickle_path, abbr)

    # Report any build errors/warnings.
    the_err_sheet.appendData(x)
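
# Note: dataSheet is the Google Sheets wrapper shared by all these scripts.
# Its implementation isn't shown; the interface the examples rely on is
# roughly the following (a sketch, not the actual class):
class dataSheet:
    """A tab of a Google sheet, addressed by spreadsheet id and A1 range."""

    def __init__(self, sheet_id, range_name):
        self.sheet_id = sheet_id
        self.range_name = range_name  # e.g., 'Sheet1!A:Z'

    def getData(self):
        """Return the rows in the range as a list of lists of strings."""
        ...

    def appendData(self, rows):
        """Append rows (a list of lists) below the existing data."""
        ...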
Example #3
def get_collection(sheet_id,
                   sheet_tab,
                   feed_stem,
                   collection_title,
                   multipart=False):
    """Get Internet Archive collection and save to pickle.

    Args:
        sheet_id (str): Google sheet id
        sheet_tab (str): Google sheet tab name
        feed_stem (str): abbreviation to be used in file naming and feed identification
        collection_title (str): Title of collection (e.g., Medical Heritage Library)
        multipart (bool, optional): Incl/exclude multi-volume works. Defaults to False.
    """
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')

    pickle_path = OUT_PATH + feed_stem + '.pickle'

    # get a list of bibids and ia ids to process
    the_inputs = the_in_sheet.getData()
    the_inputs.pop(0)  # remove header row
    print(str(len(the_inputs)) + ' records in ' + collection_title + '...')
    the_records = []
    for i in the_inputs:

        # the_920s = i[6:]  # get arbitrary number of 920s for this row
        the_920s = i[4].split(';')  # get arbitrary number of 920s for this row
        rl = []
        for r in the_920s:
            if 'archive.org' in r:
                rp = ia.parse_920(r)
                # Only add if the id is non-empty.
                if rp['id']:
                    rl.append({
                        'bibid': i[0],
                        'id': rp['id'],
                        'label': rp['label']
                    })

        # If we are allowing multi-volume works, add all;
        # otherwise, only add to list if it is a monograph.
        if len(rl) == 1 or multipart:
            the_records += rl

    feed_data = ia.extract_data(the_records, feed_stem, collection_title)

    print('Saving ' + str(len(feed_data['data'])) + ' records to ' +
          pickle_path)
    util.pickle_it(feed_data, pickle_path)

    the_out_sheet.appendData(feed_data['errors'])
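
# Note: ia.parse_920 isn't shown. From its use above, it takes one segment of
# a MARC 920 field and returns a dict with 'id' and 'label' keys. A
# hypothetical sketch, assuming the segment holds an archive.org/details URL
# plus optional free text (the real field layout may differ):
import re


def parse_920(segment):
    """Pull an Internet Archive item id (and label, if any) from a 920 segment."""
    match = re.search(r'archive\.org/details/([^\s;]+)', segment)
    item_id = match.group(1) if match else None
    # Treat any text preceding the URL as the label.
    label = segment.split('http')[0].strip(' :,') or None
    return {'id': item_id, 'label': label}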
Example #4
def get_collection(sheet_id,
                   sheet_tab,
                   feed_stem,
                   collection_title,
                   multipart=False):
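    """Get OAPEN collection and save to pickle.

    Args mirror the IA version of get_collection above; multipart
    includes/excludes multi-volume works and defaults to False.
    """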

    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')

    pickle_path = output_dir + '/' + feed_stem + '.pickle'

    # get a list of bibids and ia ids to process
    the_inputs = the_in_sheet.getData()
    the_inputs.pop(0)  # remove header row
    the_records = []
    for i in the_inputs:
        the_920s = i[4].split(';')  # get arbitrary number of 920s for this row
        rl = []
        for r in the_920s:
            # if 'oapen.org/record' in r:
            if 'library.oapen.org/handle/20.500.12657/' in r:
                rp = parse_920(r)
                rl.append({
                    'bibid': i[0],
                    'id': rp['id'],
                    'label': rp['label']
                })

        # If we are allowing multi-volume works, add all;
        # otherwise, only add to list if it is a monograph.
        if len(rl) == 1 or multipart:
            the_records += rl
        elif len(rl) > 1:
            print("WARNING: " + str(i[0]) + " has multiple volumes. Skipping!")
        else:
            print("WARNING: could not find OAPEN record in " + str(i[0]) +
                  ". Skipping!")

    feed_data = extract_data(the_records, feed_stem, collection_title)

    print('Saving ' + str(len(feed_data['data'])) + ' records to ' +
          pickle_path)
    util.pickle_it(feed_data, pickle_path)

    pprint(feed_data['errors'])

    the_out_sheet.appendData(feed_data['errors'])
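
# Note: Examples #3 and #4 differ only in which URL they look for and which
# parser they call. A sketch of one way to fold them together (the function
# name and the url_marker/parse_func parameters are invented here):
def get_collection_generic(in_sheet, feed_stem, collection_title,
                           url_marker, parse_func, multipart=False):
    """Collect records whose 920 strings contain url_marker, then extract."""
    rows = in_sheet.getData()[1:]  # skip header row
    the_records = []
    for row in rows:
        rl = []
        for r in row[4].split(';'):
            if url_marker in r:
                rp = parse_func(r)
                if rp['id']:
                    rl.append({'bibid': row[0],
                               'id': rp['id'],
                               'label': rp['label']})
        if len(rl) == 1 or multipart:
            the_records += rl
    return extract_data(the_records, feed_stem, collection_title)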
Example #5
def main():
    the_sheet = dataSheet('1D2E5Sm3qZdU3MGXk7q2XxfBpQS1iqauQm19f_y9aTbM',
                          'Sheet1!A:Z')

    out_path = os.path.join(MY_PATH,
                            'output_test/springer/springer_subjects.pickle')
    subject_data = get_subjects(the_sheet)

    # Print whatever pickle_it returns, as confirmation of the save.
    print(pickle_it(subject_data, out_path))
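
# Note: get_subjects isn't shown. A plausible sketch, assuming the sheet maps
# subject codes to display names in its first two columns (the real column
# layout may differ):
def get_subjects(sheet):
    """Return a {code: name} dict built from the sheet's rows."""
    rows = sheet.getData()[1:]  # skip header row
    return {row[0]: row[1] for row in rows}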
Example #6
def get_linglong():
    """Get the linglong data from IA and save in one pickle per year (vol).
    """
    the_sheet = dataSheet(SHEET_ID, 'LingLong!A:Z')

    the_input = the_sheet.getData()
    heads = the_input.pop(0)  # remove header row

    the_data = []

    for y in range(1931, 1938):
        the_data.append({
            'vol': y,
            'items': [{'bibid': r[0], 'id': r[2], 'label': r[3]}
                      for r in the_input if r[1] == str(y)]
        })

    for vol_data in the_data:
        print(' ')
        print(vol_data['vol'])
        feed_stem = 'ia_ll_' + str(vol_data['vol'])
        pickle_path = OUTPUT_FOLDER + '/' + feed_stem + '.pickle'
        feed_data = ia.extract_data(vol_data['items'], feed_stem,
                                    'Ling Long (' + str(vol_data['vol']) + ')')

        pprint(feed_data['errors'])

        print('Saving ' + str(len(feed_data['data'])) + ' records to ' +
              pickle_path)

        util.pickle_it(feed_data, pickle_path)
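
# Reading one year's feed back for inspection (a usage sketch, reusing the
# OUTPUT_FOLDER constant and util.unpickle_it from above):
feed_1931 = util.unpickle_it(OUTPUT_FOLDER + '/ia_ll_1931.pickle')
print(str(len(feed_1931['data'])) + ' records, ' +
      str(len(feed_1931['errors'])) + ' errors')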
Example #7
# Script to harvest OAPEN data and transform it to OPDS.
# TODO: everything!

import requests
import json
from pprint import pprint
import dcps_utils as util


the_collections = [
    {
        "name": "ERC",
        "url": ("http://library.oapen.org/rest/search"
                "?query=oapen.collection:%22European%20Research%20Council%22"
                "&expand=metadata,bitstreams&limit=1000"),
    },
]

for c in the_collections:
    collection_data = json.loads(requests.get(c["url"]).text)

    util.pickle_it(collection_data,
                   'output/oapen_' + c["name"] + '_data.pickle')