Example #1
def main():

    sheet_id = "1GEeNpKBhfjOCJGx1zJfi6XgZ4OWhGhzWsOHRT9DkmpY"

    # list_sheet = dataSheet(sheet_id, 'Test!A:Z')  # test
    list_sheet = dataSheet(sheet_id, "batch!A:Z")
    report_sheet = dataSheet(sheet_id, "output!A:Z")

    the_uris = list_sheet.getDataColumns()[0]

    output_data = []
    for uri in the_uris:
        asid = uri.split("/")[3]
        x = fix_agent(asid, "families")
        pprint(x["display_name"])
        res = asf.postAgent(asid, json.dumps(x), agent_type="families")
        print(res)
        row = [SERVER, uri, str(res)]
        output_data.append(row)

    print(output_data)

    report_sheet.appendData(output_data)

    quit()
Example #2
def main():

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    sheet_id = '1tYOXSDFlkbX_revB_ULvhmCdvKkyzpipBTkYqYXcM38'

    bibids_sheet = dataSheet(sheet_id, 'MARC-exported!A:Z')
    holding_counts_sheet = dataSheet(sheet_id, 'holding_counts!A:Z')

    the_bibids = [r[0] for r in bibids_sheet.getData()]

    ### MARC ###

    # Read the MARC

    the_heads = ['bibid', 'holdings_count']
    the_rows = [the_heads]

    for abib in the_bibids:
        print('Getting MARC for ' + str(abib))

        marc_path = os.path.join(my_path, 'output/marc/' + str(abib) + '.marc')

        if os.path.exists(marc_path):

            try:
                with open(marc_path, 'rb') as fh:
                    reader = MARCReader(fh)
                    for record in reader:
                        # Find out if there is more than one holding; if there is, we cannot use it to automatically match top containers by name and will skip.
                        the_852s = record.get_fields('852')
                        count_852s = len(the_852s)

            except Exception as error:
                count_852s = 'ERROR: ' + str(error)
            the_rows.append([abib, str(count_852s)])

        else:
            print("Could not find " + marc_path + "... skipping...")

    # print(the_rows)
    holding_counts_sheet.clear()
    x = holding_counts_sheet.appendData(the_rows)
    print(x)

    quit()
    # Write results to google sheet

    marc_sheet.clear()
    x = marc_sheet.appendData(the_rows)
    print(x)

    quit()
Example #3
def main():
    # SERVER = "Test"  # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    LOOKUP = '/Users/dwh2128/Documents/git/dcps-utils/archivesspace/as_reports/id_lookup_prod.csv'

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)

    # print(the_refs)

    the_output = []
    for r in the_data:
        bibid = r[0]
        repo = r[1]
        ref = r[2]
        extref_old = r[3]
        extref_new = r[5]
        the_res = json.loads(asf.getResourceByBibID(bibid, LOOKUP))
        # pprint(the_res)

        asid = the_res['uri'].split('/')[4]

        print("repo: " + str(repo) + "; asid: " + str(asid))

        the_notes = json.dumps(the_res['notes'])
        # print(the_notes)
        print(" ")

        the_new_notes = replace_notes(
            the_notes, [
                # fix problem of leading space in href
                {'find': 'xlink:href=\\" http',
                 'replace': 'xlink:href=\\"http'},
                # replace old url with new one
                {'find': extref_old,
                 'replace': extref_new}])

        # print(the_new_notes)

        the_res['notes'] = json.loads(the_new_notes)

        x = asf.postResource(repo, asid, json.dumps(the_res))
        out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)]
        print(out_row)
        the_output.append(out_row)

    # # write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()
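
# replace_notes() is defined elsewhere in this project. Judging from the
# inline version of the same fixes in Example #13 (plain str.replace on the
# serialized notes), a minimal sketch of it might look like this (an
# assumption for illustration, not the project's actual implementation):
def replace_notes(notes_str, replacements):
    # Apply each find/replace pair to the JSON-serialized notes string.
    for rep in replacements:
        notes_str = notes_str.replace(rep['find'], rep['replace'])
    return notes_str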
Example #4
def main():

    # Test sheet with sample data
    sheet_id = '19zHqOJt9XUGfrfzAXzOcr4uARgCGbyYiOtoaOCAMP7s'
    sheet_range = 'Sheet1!A:Z'

    # Data from sheetFeeder dataSheet
    print("1. dataSheet data array:")
    ds = dataSheet(sheet_id, sheet_range).getData()
    print(ds)

    print("")

    # ds to df example
    df = datasheet_to_dataframe(sheet_id, sheet_range)
    print("2. Converted to DataFrame:")
    print(df)
    print("")
    print("DataFrame shape:")
    print(df.shape)
    print("")
    print("Data types:")
    print(df.dtypes)
    print("")
    print("Column averages:")
    print(df.mean())

    df['mean'] = df.mean(numeric_only=True, axis=1)
    print(df)
    df.assign(mean_a=df.a.mean(), mean_b=df.b.mean())

    # ds = dataframe_to_datasheet(df)
    # # print(ds)
    # dataSheet(sheet_id, sheet_range).appendData(ds)
    quit()

    print("")

    # df back to ds
    ds = dataframe_to_datasheet(df)
    print("3. Converted back to dataSheet array:")
    print(ds)

    print("")

    # Get sheetFeeder data as series, and convert to Pandas df
    ds = dataSheet(sheet_id, sheet_range)
    ds_series = ds.getDataSeries()
    print("4. Data as series:")
    print(ds_series)
    print("")

    df = pd.DataFrame(ds_series)
    print("5. Series converted to dataframe:")
    print(df)
Example #5
def main():
    """Get IA collection specific to serial (Columbia Library Columns).
    """
    sheet_id = '1yTDyd5GQFEsVBiKOnt5T1ejBdXhxhmXVUn6jQ-dg_5I'
    sheet_tab = 'ColumbiaColumns'
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')
    the_err_sheet = dataSheet(sheet_id, 'errors!A:Z')

    output_folder = 'output/ia/'
    feed_stem = 'ia_clc_feed'
    collection_title = 'Columbia Library Columns'
    abbr = 'clc'

    pickle_path = output_folder + feed_stem + '.pickle'

    the_input = the_in_sheet.getData()
    heads = the_input.pop(0)

    the_records = [{'bibid': r[0], 'id': r[2], 'label': r[3]}
                   for r in the_input]

    feed_data = ia.extract_data(the_records, feed_stem, collection_title)

    feed_data_new = {'errors': feed_data['errors'], 'data': []}
    for e in feed_data['data']:
        new_entry = e

        des = new_entry['description']
        des_new = []
        for d in des:
            if '<a' not in d:
                des_new.append(d)
        new_entry['description'] = des_new
        feed_data_new['data'].append(new_entry)

    # pprint(feed_data_new)

    # Save to pickle.
    print('Saving ' + str(len(feed_data_new['data'])) +
          ' records to ' + pickle_path)
    # util.pickle_it(feed_data_new['data'], pickle_path)
    util.pickle_it(feed_data_new, pickle_path)

    # Report any extraction errors

    the_out_sheet.appendData(feed_data['errors'])

    # Generate XML

    x = ia.build_feed(output_folder + feed_stem + '.pickle', abbr)

    # report any build errors/warnings
    the_err_sheet.appendData(x)
Example #6
def get_collection(sheet_id,
                   sheet_tab,
                   feed_stem,
                   collection_title,
                   multipart=False):
    """Get Internet Archive collection and save to pickle.

    Args:
        sheet_id (str): Google sheet id
        sheet_tab (str): Google sheet tab name
        feed_stem (str): abbreviation to be used in file naming and feed identification
        collection_title (str): Title of collection (e.g., Medical Heritage Library)
        multipart (bool, optional): Incl/exclude multi-volume works. Defaults to False.
    """
    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')

    pickle_path = OUT_PATH + feed_stem + '.pickle'

    # get a list of bibids and ia ids to process
    the_inputs = the_in_sheet.getData()
    the_inputs.pop(0)  # remove head row
    print(str(len(the_inputs)) + ' records in ' + collection_title + '...')
    the_records = []
    for i in the_inputs:

        # the_920s = i[6:]  # get arbitrary number of 920s for this row
        the_920s = i[4].split(';')  # get arbitrary number of 920s for this row
        rl = []
        for r in the_920s:
            if 'archive.org' in r:
                rp = ia.parse_920(r)
                # Only add if id != None.
                if bool(rp['id']):
                    rl.append({
                        'bibid': i[0],
                        'id': rp['id'],
                        'label': rp['label']
                    })

        # If we are allowing multi-volume works, add all;
        # otherwise, only add to list if it is a monograph.
        if len(rl) == 1 or multipart is True:
            the_records += rl

    feed_data = ia.extract_data(the_records, feed_stem, collection_title)

    print('Saving ' + str(len(feed_data['data'])) + ' records to ' +
          pickle_path)
    util.pickle_it(feed_data, pickle_path)

    the_out_sheet.appendData(feed_data['errors'])
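
# A sketch of how get_collection() might be invoked; the sheet id below is a
# placeholder and the tab name is an assumption, while the feed stem and
# collection title echo values that appear elsewhere in these examples:
# get_collection('<google-sheet-id>', 'MedicalHeritage', 'ia_med_feed',
#                'Medical Heritage Library', multipart=False)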
Example #7
def main():

    sheet_id = '1yTDyd5GQFEsVBiKOnt5T1ejBdXhxhmXVUn6jQ-dg_5I'
    # the_voyager_range = 'Durst!A:Z'
    the_voyager_range = 'MWM!A:Z'
    the_ingest_range = 'ingested!A:Z'
    the_output_range = 'test!A:Z'

    the_data = dataSheet(sheet_id, the_voyager_range).getData()
    the_ingested = [
        x[0] for x in dataSheet(sheet_id, the_ingest_range).getData()
    ]  # list of IA ids that are in the IA collection.

    the_output_sheet = dataSheet(sheet_id, the_output_range)

    the_heads = ['bibid', 'id', 'label', 'url', 'composed', 'in collection?']
    the_new_data = [the_heads]

    for a_row in the_data:

        bibid = a_row[0]
        the_920 = a_row[4]
        if the_920:
            # print(parse_920(the_920))
            parsed = parse_920(the_920)

            for d in parsed:
                if 'archive.org' in d['url']:

                    # id = d['url'].split('/')[-1]
                    id = (d['url'][:-1] if d['url'].endswith('/') else
                          d['url']).split('/')[-1]
                    match_flag = "Y" if id in the_ingested else "N"
                    url = 'https://ebooksbeta.lyrasistechnology.org/columbia/book/URI%2Furn%3Ax-internet-archive%3Aebooks-app%3Aitem%3A' + id
                    if 'label' in d:
                        label = 'Read on mobile (' + d['label'] + ')'
                    else:
                        label = "Read on mobile"
                    composed = '$3' + label + '$u' + url
                    the_new_data.append(
                        [bibid, id, label, url, composed, match_flag])

    print(the_new_data)

    the_output_sheet.clear()

    post = the_output_sheet.appendData(the_new_data)
    print(post)
Example #8
def get_collection(sheet_id,
                   sheet_tab,
                   feed_stem,
                   collection_title,
                   multipart=False):

    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')

    pickle_path = output_dir + '/' + feed_stem + '.pickle'

    # get a list of bibids and ia ids to process
    the_inputs = the_in_sheet.getData()
    the_inputs.pop(0)  # remove head row
    the_records = []
    for i in the_inputs:
        the_920s = i[4].split(';')  # get arbitrary number of 920s for this row
        rl = []
        for r in the_920s:
            # if 'oapen.org/record' in r:
            if 'library.oapen.org/handle/20.500.12657/' in r:
                rp = parse_920(r)
                rl.append({
                    'bibid': i[0],
                    'id': rp['id'],
                    'label': rp['label']
                })

        # If we are allowing multi-volume works, add all;
        # otherwise, only add to list if it is a monograph.
        if len(rl) == 1 or multipart is True:
            the_records += rl
        elif len(rl) > 1:
            print("WARNING: " + str(i[0]) + " has multiple volumes. Skipping!")
        else:
            print("WARNING: could not find OAPEN record in " + str(i[0]) +
                  ". Skipping!")

    feed_data = extract_data(the_records, feed_stem, collection_title)

    print('Saving ' + str(len(feed_data['data'])) + ' records to ' +
          pickle_path)
    util.pickle_it(feed_data, pickle_path)

    # print(feed_data['data'])
    pprint(feed_data['errors'])

    the_out_sheet.appendData(feed_data['errors'])
Example #9
def main():

    # x = util.unpickle_it('output/oapen/oapen_clio.pickle')

    # from pprint import pprint
    # pprint(x[1])

    report_metadata('1kLI8x1whzSNqeKL5xVysopgKYWE9-D9H_PHX2RkW4wQ',
                    'output!A:Z', 'output/oapen/oapen_clio.pickle')

    quit()
    out_sheet = dataSheet('1OG0UgqHCdAzx326JNy7akx9-MOwR9A_MSf-MEv9k3Ms',
                          'OAPEN!A:Z')

    the_pickles = ['output/oapen/oapen_clio.pickle']

    item_url_base = 'https://ebooks.lyrasistechnology.org/190150/book/https%3A%2F%2Fcolumbia.lyrasistechnology.org%2F190150%2Fworks%2FURI%2Fhttp%3A%2F%2Flibrary.oapen.org%2Fhandle%2F20.500.12657%2F'

    the_output = [['COLL', 'ID', 'BIBID', 'HREF']]

    # Add rows for items within each collection
    for p in the_pickles:
        the_output += [[
            r['collection'], r['id'], r['bibid'], item_url_base + r['id']
        ] for r in get_bibs_and_ids(p)]

    out_sheet.clear()
    out_sheet.appendData(the_output)

    quit()
Example #10
def main():

    asf.setServer('Prod')

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  #set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y-%m-%d"))

    sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='mtime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                print(row)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        # the_sheet.clear()

        # the_sheet.appendData([the_fields])
        the_sheet.appendData(the_modifieds)

    quit()
Example #11
def main():
    """Script to compose OPDS v1.2 XML feeds using data saved from ia_get_collections.py.
    Modify list of collections as needed.
    """
    the_out_sheet = dataSheet(SHEET_ID, 'errors!A:Z')

    the_collections = [
        (OUTPUT_DIR + '/ia_avt_feed.pickle', 'avt'),
        (OUTPUT_DIR + '/ia_ccny_feed.pickle', 'ccny'),
        (OUTPUT_DIR + '/ia_durst_feed.pickle', 'durst'),
        (OUTPUT_DIR + '/ia_med_feed.pickle', 'med'),
        (OUTPUT_DIR + '/ia_mrp_feed.pickle', 'mrp'),
        (OUTPUT_DIR + '/ia_mwm_feed.pickle', 'mwm'),
        (OUTPUT_DIR + '/ia_wwi_feed.pickle', 'wwi'),
        (OUTPUT_DIR + '/ia_clc_feed.pickle', 'clc'),
        (OUTPUT_DIR + '/ia_hebrewmss_feed.pickle', 'hebrewmss'),
        # ('output/ia/ia_tibetan_feed.pickle', 'tibet'),
    ]

    for col in the_collections:
        x = ia.build_feed(col[0], col[1], output_dir=OUTPUT_DIR)
        the_out_sheet.appendData(x)

    build_linglong_feed(pickle_dir=OUTPUT_DIR, output_dir=OUTPUT_DIR)

    # validate the output
    x = opds_validate.validate_files(OUTPUT_DIR)
    print('\n'.join('***ERROR!*** File ' + r['file'] + ' has errors: ' +
                    r['errors'] for r in x if r['errors']))

    quit()
Example #12
def main():

    the_sheet = dataSheet('183S8_aMD6py8XvVzIAE4WnDyUeCuLYsyacyNImX1frM',
                          'Sheet1!A:Z')

    in_file = os.path.join(
        MY_PATH, "output_test/springer/springer_test_feed_datastore.json")

    with open(in_file, "rb") as f:
        json_data = json.load(f)

    the_data = [[
        'Title', 'DOI', 'ISBN', 'Print ISBN', 'E-ISBN', 'Pub Date', 'PDF',
        'EPUB'
    ]]
    for b in json_data:
        title = b['publicationName']
        doi = b['doi']
        isbn = b['isbn']
        print_isbn = b['printIsbn']
        e_isbn = b['electronicIsbn']
        pub_date = b['publicationDate']

        link_info = get_type(b)

        row = [
            title, doi, isbn, print_isbn, e_isbn, pub_date,
            link_info['has_pdf'], link_info['has_epub']
        ]
        # print(get_type(b['links']))
        the_data.append(row)

    the_sheet.clear()
    the_sheet.appendData(the_data)
    quit()
Example #13
def main():
    SERVER = "Prod"  # test
    # SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)

    # print(the_refs)

    the_output = []
    for r in the_data:
        repo = r[1]
        ref = r[2]
        extref_old = r[3]
        extref_new = r[5]
        the_ao = json.loads(asf.getArchivalObjectByRef(repo, ref))
        asid = the_ao['uri'].split('/')[4]

        print("asid: " + str(asid))

        the_notes = json.dumps(the_ao['notes'])

        # fix problem of leading space in href
        the_new_notes = the_notes.replace('xlink:href=\\" http',
                                          'xlink:href=\\"http')
        # replace old url with new one
        the_new_notes = the_new_notes.replace(extref_old, extref_new)

        print(the_new_notes)
        the_ao['notes'] = json.loads(the_new_notes)

        pprint(the_ao)

        x = asf.postArchivalObject(repo, asid, json.dumps(the_ao))
        out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)]
        print(out_row)
        the_output.append(out_row)

    # write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()
Example #14
def main():
    the_sheet = dataSheet('1D2E5Sm3qZdU3MGXk7q2XxfBpQS1iqauQm19f_y9aTbM',
                          'Sheet1!A:Z')

    out_path = os.path.join(MY_PATH,
                            'output_test/springer/springer_subjects.pickle')
    subject_data = get_subjects(the_sheet)

    print(pickle_it(subject_data, out_path))

    # pprint(subject_data)

    quit()
Example #15
def build_linglong_feed(pickle_dir=OUTPUT_FOLDER, output_dir=OUTPUT_FOLDER):
    """Run after data has been extracted via get_linglong.

    Args:
        pickle_dir (str, optional): Path to folder containing pickles. Defaults to OUTPUT_FOLDER.
        output_dir (str, optional): Path to output folder. Defaults to OUTPUT_FOLDER.
    """
    the_out_sheet = dataSheet(SHEET_ID, 'errors!A:Z')

    for y in range(1931, 1938):
        x = ia.build_feed(pickle_dir + '/ia_ll_' + str(y) + '.pickle',
                          'll',
                          output_dir=output_dir)
        the_out_sheet.appendData(x)
Example #16
def get_collection(sheet_id,
                   sheet_tab,
                   feed_stem,
                   collection_title,
                   multipart=False):

    the_in_sheet = dataSheet(sheet_id, sheet_tab + '!A:Z')
    the_out_sheet = dataSheet(sheet_id, 'extract-errors!A:Z')

    # get a list of bibids and ia ids to process
    the_inputs = the_in_sheet.getData()
    the_inputs.pop(0)  # remove head row
    the_records = []
    for i in the_inputs:
        the_920s = i[4].split(';')  # get arbitrary number of 920s for this row
        rl = []
        for r in the_920s:
            if 'www.gutenberg.org/ebooks/' in r:
                rp = parse_920(r)
                rl.append({
                    'bibid': i[0],
                    'id': rp['id'],
                    'label': rp['label']
                })

        # If we are allowing multi-volume works, add all;
        # otherwise, only add to list if it is a monograph.
        if len(rl) == 1 or multipart is True:
            the_records += rl
        elif len(rl) > 1:
            print("WARNING: " + str(i[0]) + " has multiple volumes. Skipping!")
        else:
            print("WARNING: could not find Project Gutenberg record in " +
                  str(i[0]) + ". Skipping!")

    # feed_data = extract_data(the_records, feed_stem, collection_title)
    return the_records
Example #17
def main():
    # SERVER = "Test" # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1OABHEJF1jqA1vlbW5yTENry5W7YqKlag5nJDJ9ouCzg'
    # read_sheet = dataSheet(sheet_id, 'Test!A:Z')  # Test
    read_sheet = dataSheet(sheet_id, 'Prod!A:Z')
    write_sheet = dataSheet(sheet_id, 'output!A:Z')

    the_refs = read_sheet.getDataColumns()[0]
    # print(the_refs)

    the_output = []
    for r in the_refs:
        the_ao = json.loads(asf.getArchivalObjectByRef(2, r))
        asid = the_ao['uri'].split('/')[4]
        old_date = str(the_ao['dates'][0]['begin'])
        new_ao = fix_begin_date(2, the_ao)
        new_date = str(new_ao['dates'][0]['begin'])
        print("asid: " + str(asid))
        x = asf.postArchivalObject(2, asid, json.dumps(new_ao))
        out_row = [SERVER, r, asid, old_date, new_date, str(x)]
        # print(out_row)
        the_output.append(out_row)

    write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()

    x = fix_begin_date(2, 'b2ec9ce511e4212ebb145fb909ca85bd')
    print(x)

    pprint(
        json.loads(
            asf.getArchivalObjectByRef(2, 'b2ec9ce511e4212ebb145fb909ca85bd')))
    quit()
Example #18
def main():

    sheet_id = '1_1d8aElm9yRG4Avy9j6WxTh2TjhMp8iqaeZkgUNdxeE'
    report_sheet = dataSheet(sheet_id, 'Test!A:Z')
    lookup_sheet = dataSheet(sheet_id, 'Lookup!A:Z')

    lookup_file = os.path.join(MY_PATH,
                               "output_test/proquest/proquest_lookup.json")

    x = get_lookup(lookup_sheet, lookup_file)

    print(x)
    quit()
    in_file = os.path.join(MY_PATH,
                           "output_test/proquest/ProQuest_BooksCatalog.json")

    with open(in_file, "rb") as f:
        json_data = json.load(f)

    the_books = json_data['opdsFeed']['groups'][0]['publications']

    the_data = [['Title', 'EBC ID', 'URL', 'PDF', 'EPUB']]
    for b in the_books:
        id = b['metadata']['identifier'].split('/')[-1]
        url = "https://ebookcentral.proquest.com/lib/columbia/detail.action?docID=" + \
            str(id)
        link_info = get_type(b['links'])
        row = [
            b['metadata']['title'], id, url, link_info['has_pdf'],
            link_info['has_epub']
        ]
        # print(get_type(b['links']))
        the_data.append(row)

    report_sheet.clear()
    report_sheet.appendData(the_data)
    quit()
Example #19
def main():

    asf.setServer('Prod')

    # the_repos = [2, 3, 4, 5]
    the_repos = [2]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]

    the_sheet = dataSheet('198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY',
                          'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo and asid from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
            print(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    print('Total unpublished: ' + str(len(the_unpublished)))

    # the_sheet.clear()
    # the_sheet.appendData([the_fields])
    # the_sheet.appendData(the_unpublished)


    quit()
Example #20
def main():

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    the_output_sheet = dataSheet(
        '1sjpjLt_I54h9l-ABwueYdN6xVAm01S6rKZB3BgMQv3k', 'output!A:Z')

    # The table of xlinks with format:
    # bibid|container_id|href|title|text
    aCSV = os.path.join(my_path, 'output/output_all_f.txt')

    with open(aCSV) as the_csv:
        the_data = [row for row in csv.reader(the_csv, delimiter='|')]

    the_heads = the_data.pop(0)
    the_heads += ['STATUS', 'REDIRECT_LOCATION', 'REDIRECT_STATUS']
    the_new_data = [the_heads]

    for a_row in the_data:
        print(a_row)
        if 'clio.columbia.edu' in a_row[2]:
            print('Skipping CLIO record ' + a_row[2])
        else:
            response = get_response(a_row[2])
            if response['status'] != 200:
                new_row = a_row
                while len(new_row) < 5:
                    new_row.append("")
                new_row.append(response['status'])
                if 'location' in response:
                    redirect_response = get_response(response['location'])
                    new_row += [
                        response['location'], redirect_response['status']
                    ]

                print(new_row)
                the_new_data.append(new_row)

    # Write output to sheet.
    the_output_sheet.clear()
    the_output_sheet.appendData(the_new_data)
Example #21
def main():
    # TEST

    x = feed_parse(
        'https://www.gutenberg.org/ebooks/search.opds/?sort_order=downloads')

    pprint(x)

    quit()
    ###

    the_sheet = dataSheet('1SyErJ6LqNUzEoJ5LP14L9Ofkn63CUaDor4H_8cRFGgo',
                          'test!A:Z')

    a_url = 'https://ebooks-test.library.columbia.edu/feeds/ia/culcarnegiecorp/'
    # a_url = 'https://ebooks-test.library.columbia.edu/feeds/ia/cullinglong/'

    the_dicts = feed_parse(a_url)

    the_heads = ['BIBID', 'ID', 'Title', 'URL', 'Label']

    the_data = [[
        x['id'], x['title'],
        "https://ebooks.lyrasistechnology.org/columbia/book/URI%2F" +
        urllib.parse.quote(x['id'])
    ] for x in the_dicts]

    for row in the_data:
        try:
            row.insert(0, re.search('_(.+?)_', row[0]).group(1))
        except:
            # raise "Could not parse bibid"
            print("Could not parse bibid -- " + row[0])
            row.insert(0, "")

    the_data.insert(0, the_heads)

    the_sheet.clear()
    x = the_sheet.appendData(the_data)
    print(x)
Example #22
def get_linglong():
    """Get the linglong data from IA and save in one pickle per year (vol).
    """
    the_sheet = dataSheet(SHEET_ID, 'LingLong!A:Z')

    the_input = the_sheet.getData()
    heads = the_input.pop(0)

    the_data = []

    for y in range(1931, 1938):
        the_data.append({
            'vol': y,
            'items': [{
                'bibid': r[0],
                'id': r[2],
                'label': r[3]
            } for r in the_input if r[1] == str(y)]
        })

    # pprint(the_data)

    for vol_data in the_data:
        print(' ')
        print(vol_data['vol'])
        feed_stem = 'ia_ll_' + str(vol_data['vol'])
        pickle_path = OUTPUT_FOLDER + '/' + feed_stem + '.pickle'
        # print(vol_data['items'])
        feed_data = ia.extract_data(vol_data['items'], feed_stem,
                                    'Ling Long (' + str(vol_data['vol']) + ')')

        pprint(feed_data['errors'])

        print('Saving ' + str(len(feed_data['data'])) + ' records to ' +
              pickle_path)

        util.pickle_it(feed_data, pickle_path)
Example #23
def main():

    # Set path to Saxon processor
    saxon_path = 'saxon-9.8.0.12-he.jar'

    # Set path to XSLT
    the_xslt = 'ead_tbm_csv.xsl'
    the_infile = '/path/to/source/xmlfile_ead.xml'
    the_outpath = '/path/to/output_file.txt'

    # Set Google Sheet id and range
    the_sheet = dataSheet('LmguZqjAk23OPHeDmyy2wvZiXGaiLz7', 'Test!A:Z')

    # Parameters to pass to XSLT:
    params = {
        'series_scope': 7,  # series to process; 0 = all series
        'subject': 'AUDIO RECORDINGS'  # static label for content
    }

    # generate a parameter string from params
    param_str = ''
    for key, value in params.items():
        value = str(value).replace(' ', '\\ ')  # escape spaces in the value
        param_str += str(key) + '=' + str(value) + ' '

    # Send to Saxon with parameters
    saxon_process(saxon_path,
                  the_infile,
                  the_xslt,
                  the_outpath,
                  theParams=param_str)

    # Send result csv to Google Sheet
    y = the_sheet.importCSV(the_outpath, delim='|')

    print(y)
Example #24
def main():

    sheet_id = '1X4e52glzKJjRpiOb7S6b4bOzyhTfjHCVlfjPosdABKM'

    the_info = [
        {
            'name': 'AveryTrade',
            'file': 'output/ia/ia_avt_feed.pickle'
        },
        {
            'name': 'Carnegie',
            'file': 'output/ia/ia_ccny_feed.pickle'
        },
        {
            'name': 'Durst',
            'file': 'output/ia/ia_durst_feed.pickle'
        },
        {
            'name': 'MedicalHeritage',
            'file': 'output/ia/ia_med_feed.pickle'
        },
        {
            'name': 'MissionaryResearch',
            'file': 'output/ia/ia_mrp_feed.pickle'
        },
        {
            'name': 'MuslimWorld',
            'file': 'output/ia/ia_mwm_feed.pickle'
        },
        {
            'name': 'WWI',
            'file': 'output/ia/ia_wwi_feed.pickle'
        },
    ]

    the_out_sheet = dataSheet(sheet_id, 'Combined!A:Z')
    the_out_sheet.clear()

    the_heads = ['collection', 'bibid', 'href', 'label']
    the_data = [the_heads]
    for i in the_info:
        the_links = get_links(i['file'])
        the_data += [[i['name'], r[0], r[1], r[2]] for r in the_links]

    post = the_out_sheet.appendData(the_data)
    print(post)

    quit()
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1931.pickle'))
    # print(post)
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1932.pickle'))
    # print(post)
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1933.pickle'))
    # print(post)
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1934.pickle'))
    # print(post)
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1935.pickle'))
    # print(post)
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1936.pickle'))
    # print(post)
    # post = the_out_sheet.appendData(get_links('output/ia/ia_ll_1937.pickle'))
    # print(post)

    quit()
Example #25
from sheetFeeder import dataSheet
import pandas as pd
import pandas_functions as pf

sheet_id = '19zHqOJt9XUGfrfzAXzOcr4uARgCGbyYiOtoaOCAMP7s'
sheet_range = 'Pandas!A:Z'
the_sheet = dataSheet(sheet_id, sheet_range)

# Put some sample data in the sheet
data1 = [['Col A', 'Col B', 'Col C', 'Col D'], [1.0, 2.0, 3.0, 4.0],
         [5.0, 6.0, 7.0, 8.0], [0.0, 11.5, -5.0, 9.25]]

the_sheet.clear()
the_sheet.appendData(data1)

# 1. Get the data from the sheet (this is redundant, but just for demo purposes!)
ds = the_sheet.getData()

# 2. Convert to Pandas DataFrame
df = pf.datasheet_to_dataframe(sheet_id, sheet_range)

print(df)

print("")

# Add a column calculating the averages of each row
df['Average'] = df.mean(numeric_only=True, axis=1)
print(df)

# Convert back into array
data2 = pf.dataframe_to_datasheet(df)
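
# The example stops after converting back to an array. To round-trip the
# updated table (including the new 'Average' column) into the sheet, the same
# dataSheet calls used above apply; this assumes data2 is a list of rows with
# the header first, ready for appendData:
the_sheet.clear()
the_sheet.appendData(data2)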
Example #26
def main():

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    if DEBUG is True:
        sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test
    else:
        sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    # First get the agent records from API (this can take a long time!)

    asf.setServer("Prod")  # AS instance: Prod | Dev | Test

    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/agents"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/agents"

    family_agents_file = os.path.join(out_folder, "agents_families.pickle")
    corp_agents_file = os.path.join(out_folder, "agents_corporate.pickle")
    persons_agents_file = os.path.join(out_folder, "agents_persons.pickle")

    the_info = [
        {
            "name": "families",
            "endpoint": "/agents/families",
            "sheet": dataSheet(sheet_id, "families!A:Z"),
            "pickle": family_agents_file
        },
        {
            "name": "corporate",
            "endpoint": "/agents/corporate_entities",
            "sheet": dataSheet(sheet_id, "corporate!A:Z"),
            "pickle": corp_agents_file
        },
        {
            "name": "persons",
            "endpoint": "/agents/people",
            "sheet": dataSheet(sheet_id, "persons!A:Z"),
            "pickle": persons_agents_file
        },
    ]

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["uri", "uri"],
        ["title", "title"],
        ["source", "names/0/source"],
        ["authority_id", "names/0/authority_id"],
        ["is_linked_to_published_record", "is_linked_to_published_record"],
        ["publish", "publish"],
        ["last_modified_by", "last_modified_by"],
        ["last_modified", "system_mtime"],
    ]

    the_record_cnts = {}

    if DEBUG is True:
        print("*** (DEBUG MODE) ***")

    for i in the_info:
        print("Getting agents: " + i["name"])
        agent_data = get_agent_data(i["name"], i["endpoint"], i["pickle"])

        print(" ")

        # Report the saved data to Google Sheet

        the_sheet = i["sheet"]

        the_heads = [x[0] for x in the_fields]
        the_output = [the_heads]

        the_record_cnts[i["name"]] = str(len(agent_data))

        for agent in agent_data:
            the_row = []
            # Use dpath to extract values from dict and compose into rows.
            for af in the_fields:
                try:
                    d = str(dpath.util.get(agent, af[1]))
                except:
                    d = ""
                the_row.append(d)
            # print(the_row)
            the_output.append(the_row)

        the_sheet.clear()
        save = the_sheet.appendData(the_output)
        print(save)

    # Generate log

    print(the_record_cnts)
    print(" ".join(the_record_cnts))

    cnt_str = "".join(k + "=" + v + ". " for k, v in the_record_cnts.items())

    # print(cnt_str)

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + MY_NAME + ". " + cnt_str + " Start: " +
               start_time + ". Finished: " + end_time + " (duration: " +
               my_duration + ").")

    log_range = "log!A:A"
    log_sheet = dataSheet(sheet_id, log_range)

    log_sheet.appendData([[the_log]])

    print(" ")

    print(the_log)
    log_it(SCRIPT_NAME, the_log)
    # digester.post_digest(SCRIPT_NAME, the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        "https://docs.google.com/spreadsheets/d/" + \
        str(sheet_id) + "/edit?usp=sharing"

    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)

    quit()
Example #27
def main():
    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # day_offset = now1.weekday() + 1 # Calculate the Sunday of current week
    day_offset = 7  # use past seven days, regardless of current day

    print('Script ' + MY_NAME + ' begun at ' + start_time + '. ')

    if not DEBUG:
        the_sheet_id = '1JA5bRSnYV80sx4m5SOFQ6QJ4u21SXvQeNdNbuRVCdds'
    else:
        the_sheet_id = '1e_TAK8eUsaHltBu9J5bNO1twThqt7_nE5olmz2pdCUw'  # test doc
        day_offset = 14  # use past 2 weeks for testing

    # Set date stamp of start of week (Sunday) to determine recently created accessions.
    begin_of_week = (now1 - datetime.timedelta(day_offset)).date()

    the_sheet_rbml = dataSheet(the_sheet_id, 'rbml!A:Z')
    the_sheet_avery = dataSheet(the_sheet_id, 'avery!A:Z')
    the_sheet_rbmlbooks = dataSheet(the_sheet_id, 'rbmlbooks!A:Z')

    # Location to save output
    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/accessions"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/accessions"

    rbml_acc_file = os.path.join(out_folder, 'report_rbml_accessions.json')
    avery_acc_file = os.path.join(out_folder, 'report_avery_accessions.json')
    rbmlbooks_acc_file = os.path.join(out_folder,
                                      'report_rbmlbooks_accessions.json')

    print(' ')

    print('Starting accession report in ' +
          'https://docs.google.com/spreadsheets/d/' + str(the_sheet_id) +
          '/edit?usp=sharing')

    if not DEBUG:
        # Save the accessions as json files. In DEBUG mode, just use the files already saved.
        print('Saving Avery accession data to ' + avery_acc_file + '....')

        # Only fetch file if not in Debug mode
        with open(avery_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(3)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting Avery accession data!")

            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

        print('Saving RBML accession data to ' + rbml_acc_file + '....')

        with open(rbml_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(2)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting RBML accession data!")

            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

        print('Saving RBMLBOOKS accession data to ' + rbmlbooks_acc_file +
              '....')

        with open(rbmlbooks_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(6)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting RBMLBOOKS accession data!")

            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

    print(' ')

    # the_files = [
    #         [avery_acc_file, the_sheet_avery],
    #         [rbml_acc_file, the_sheet_rbml]
    #              ]

    the_recents = {}

    the_info = [{
        'repo_name': 'Avery',
        'repo_id': 3,
        'acc_file': avery_acc_file,
        'the_sheet': the_sheet_avery
    }, {
        'repo_name': 'RBML',
        'repo_id': 2,
        'acc_file': rbml_acc_file,
        'the_sheet': the_sheet_rbml
    }, {
        'repo_name': 'RBMLBOOKS',
        'repo_id': 6,
        'acc_file': rbmlbooks_acc_file,
        'the_sheet': the_sheet_rbmlbooks
    }]

    # The top-level elements to save from the JSON (each can be further processed below)
    the_keys = {
        "title": "title",
        "uri": "uri",
        "repository": "repository",
        "accession_date": "accession_date",
        "id_0": "id_0",
        "id_1": "id_1",
        "id_2": "id_2",
        "id_3": "id_3",
        "extents": "extents",
        "related_resources": "related_resources",
        "collection_management": "collection_management",
        "user_defined": "user_defined",
        "create_time": "create_time",
        "system_mtime": "system_mtime",
        "last_modified_by": "last_modified_by"
    }

    ext_dict = {
        "ext-number": "number",
        "ext-portion": "portion",
        "ext-type": "extent_type"
    }
    for f in the_info:

        the_file = f['acc_file']
        the_target = f['the_sheet']
        repo_name = f['repo_name']

        with open(the_file) as acc_json:
            the_data = json.load(acc_json)

        all_rows = []

        for an_accession in the_data:
            # acc_info : prelim dict for each accession. Do things to it.
            acc_info = {}
            for key, value in the_keys.items():
                try:
                    acc_info.update({key: an_accession[value]})
                except (IndexError, KeyError):
                    acc_info.update({key: ""})

            # Refine elements by extracting subelements, etc.

            # Handle collection_management
            cm = acc_info["collection_management"]
            cm_dict = {
                "processing_priority": "processing_priority",
                "processing_status": "processing_status"
            }
            for key, value in cm_dict.items():
                try:
                    acc_info[key] = cm[value]

                except (IndexError, KeyError, TypeError):
                    acc_info[key] = ''

            acc_info.pop("collection_management")

            # Parse resource id and get bibid
            res = acc_info["related_resources"]
            if len(res) > 0:
                res_url = res[0]["ref"]
                repo = res_url.split('/')[2]
                asid = res_url.split('/')[4]
                bibid = asf.lookupBibID(repo, asid, LOOKUP_CSV)
            else:
                bibid = ''
                asid = ''
            acc_info["resource_bibid"] = bibid
            acc_info["resource_asid"] = asid
            acc_info.pop("related_resources")

            # Parse BibID out of user_defined / integer_1
            try:
                usdef = acc_info["user_defined"]
                acc_info['integer_1'] = usdef['integer_1']
            except:
                acc_info['integer_1'] = ''
            acc_info.pop("user_defined")

            # Fix problem with leading "+" in id_3 (add apostrophe for display)
            acc_info["id_3"] = re.sub(r"^\+", "'+", acc_info["id_3"])

            # Handle repository
            repository = acc_info["repository"]
            if len(repository) > 0:
                repo_url = repository["ref"]
                repo = repo_url.split('/')[2]
            else:
                repo = ''
            acc_info["repo"] = repo
            acc_info.pop("repository")

            # Handle date
            acc_date = acc_info["accession_date"]
            yyyy = int(acc_date.split('-')[0])
            mm = int(acc_date.split('-')[1])
            dd = int(acc_date.split('-')[2])
            the_date = datetime.date(yyyy, mm, dd)
            # due to legacy import issue, some with unknown dates have malformed dates like 0002-01-23. Acknowledge their unknownness.
            if the_date.year < 1700:
                acc_info["accession_date"] = "0000-00-00"
                acc_info["year"] = ""
            else:
                acc_info["year"] = the_date.year

            # Fiscal year
            if the_date.year < 1700:
                acc_info["fiscal-year"] = ""
            else:
                if the_date.month > 6:
                    acc_info["fiscal-year"] = the_date.year + 1
                else:
                    acc_info["fiscal-year"] = the_date.year

            # Handle extents
            ext = acc_info["extents"]
            for key, value in ext_dict.items():
                try:
                    acc_info[key] = ext[0][value]
                except (IndexError, KeyError):
                    acc_info[key] = ''

            acc_info.pop("extents")

            # Clean up titles
            acc_info['title'] = str(acc_info['title']).strip()

            # Uncomment to list records in log.
            # print("processing: " + str(acc_info["uri"]).strip() + ' / ' + str(acc_info["title"]).strip() )

            all_rows.append(acc_info)

        processed_msg = 'Processed ' + \
            str(len(all_rows)) + ' records in ' + repo_name + '.'
        print(processed_msg)

        log_it(SCRIPT_NAME, processed_msg)

        # the_heads = list(all_rows[0].keys())

        # explicitly order the columns, as dict order is unpredictable.
        the_heads = [
            'title', 'uri', 'accession_date', 'id_0', 'id_1', 'id_2', 'id_3',
            'integer_1', 'resource_bibid', 'resource_asid', 'repo', 'year',
            'fiscal-year', 'ext-number', 'ext-portion', 'ext-type',
            'processing_priority', 'processing_status', 'create_time',
            'system_mtime', 'last_modified_by'
        ]

        the_output = []

        # Build row in order specified by the_heads
        for a_row in all_rows:
            # r = list(a_row.values())
            r = [a_row[h] for h in the_heads]
            the_output.append(r)
            # print(a_row)

        # sort by accession_date (the 2nd item in inner lists)
        the_output = sorted(the_output, key=itemgetter(2), reverse=True)

        # Get list of recents
        the_recents[repo_name] = []

        for i in the_output:
            # i[18] = the create date column
            i_date = dateutil.parser.isoparse(i[18]).date()

            if i_date > begin_of_week:

                the_recents[repo_name].append(i)

        # If there are recents, list them
        if the_recents[repo_name]:
            print(' ')
            recent_msg = str(len(the_recents[repo_name])) + \
                ' accessions recently added in ' + repo_name + ': '
            print(recent_msg)
            log_it(SCRIPT_NAME, recent_msg)
            print('-----------')
            for r in the_recents[repo_name]:
                print(r[0])
                print(r[1])
                print('Created ' + str(dateutil.parser.isoparse(r[18]).date()))
                print('Last edited by ' + r[20])
                print('-----------')
        else:
            print(' ')
            recent_msg = 'No recently created accessions in ' + repo_name
            print(recent_msg)
            log_it(SCRIPT_NAME, recent_msg)

            # print(the_recents[repo_name])

        the_output.insert(0, the_heads)

        print(' ')

        the_target.clear()

        print('Writing ' + repo_name + ' data to sheet ...')
        the_target.appendData(the_output)

        print(' ')

    # generate log and add to log tab, if exists.
    the_tabs = the_target.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    if DEBUG is True:
        the_log = '[TEST] Data imported from ' + target_server + ' by ' + MY_NAME + '. Start: ' + \
            start_time + '. Finished: ' + end_time + \
            ' (duration: ' + my_duration + ').'
    else:
        the_log = 'Data imported from ' + target_server + ' by ' + MY_NAME + '. Start: ' + \
            start_time + '. Finished: ' + end_time + \
            ' (duration: ' + my_duration + ').'

    if 'log' in the_tabs:
        log_range = 'log!A:A'
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_sheet_id, log_range).appendData([[the_log]])
    else:
        print('*** Warning: There is no log tab in this sheet. ***')

    print(' ')

    print(the_log)
    log_it(SCRIPT_NAME, the_log)

    print(' ')

    exit_msg = 'Script done. Updated data is available at ' + \
        'https://docs.google.com/spreadsheets/d/' + \
        str(the_sheet_id) + '/edit?usp=sharing'
    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)
Example #28
from sheetFeeder import dataSheet  # test
import datetime
from itertools import groupby
from pprint import pprint
import os

digest_sheet = '190p6gnhpakdYD72Eb1PLicdVlAtAxjQ7D_8oee7Tk1U'
digest_range = 'Sheet1!A:Z'

digest_sheet = dataSheet(digest_sheet, digest_range)


def main():

    icons = {
        "right-triangle": "\U000025B6",
    }

    my_name = __file__
    script_name = os.path.basename(my_name)
    # This makes sure the script can be run from any working directory and still find related files.

    now = str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S'))

    print('This 24-hour digest composed at ' + now + ' by ' + script_name +
          '. Contact [email protected] with questions/problems.')

    print(' ')
    print(' ')

    # Format the digest content.
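
    # The original example ends here. One way the formatting step could
    # proceed is to group rows per script with the groupby imported above.
    # The column layout assumed here ([date, script, message]) is
    # hypothetical and not taken from the source.
    the_rows = digest_sheet.getData()
    the_rows.sort(key=lambda r: r[1])
    for script, entries in groupby(the_rows, key=lambda r: r[1]):
        print(icons['right-triangle'] + ' ' + script)
        for e in entries:
            print(e[0] + ': ' + e[2])
        print(' ')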
Example #29
# Extract matches from a csv matching on checksums.

import csv
from sheetFeeder import dataSheet

# the_sheet = dataSheet(
#     '1ogPrdAFe1tpoGPxMXXtdrQjaGe1g_XG0OMdSaaxNZs8', 'digital-matches!A:Z')

sheet_id = '1ogPrdAFe1tpoGPxMXXtdrQjaGe1g_XG0OMdSaaxNZs8'

# checksum_sheet = dataSheet(sheet_id, 'ebooks_2011')

the_sheet = dataSheet(sheet_id, 'digital-matches2!A:Z')

# the_csv = '/Users/dwh2128/Documents/Cleanup_Project/fstore-subfolders/911-audio-pres.csv'
# the_checksum_list = '/Users/dwh2128/Documents/Cleanup_Project/fstore-subfolders/911-checksums.csv'
the_checksum_list = '/Users/dwh2128/Documents/Cleanup_Project/fstore-subfolders/ebooks_2011_checksums.csv'
the_csv = '/Users/dwh2128/Documents/Cleanup_Project/duplicates-non-ifp-filtered.csv'

the_sheet.clear()

with open(the_checksum_list) as f:
    the_checksums = [r[0] for r in csv.reader(f)]

the_matches = []
with open(the_csv) as f:
    for r in csv.reader(f):
        if r[0] in the_checksums and '/digital/' in r[1]:
            the_matches.append([r[0], r[1] + r[2]])

# print(the_checksums)
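
# The original stops after collecting the matches. Following the appendData
# pattern used throughout these examples, a final step would post them to the
# cleared sheet (the header row here is an assumption):
the_sheet.appendData([['checksum', 'path']] + the_matches)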
Example #30
def main():

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ''  #set later

    print('Script ' + my_name + ' begun at ' + start_time + '. ')

    # The Google Sheet to send data to
    the_data_sheet = dataSheet('1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0',
                               'validation!A:Z')
    # the_data_sheet = dataSheet('198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY','validation!A:Z') # Test
    # Set path to schema validator (Jing)
    jing_path = os.path.join(my_path,
                             "../resources/jing-20091111/bin/jing.jar")

    schema_filename = 'cul_as_ead.rng'
    schematron_filename = 'cul_as_ead.sch'
    schema_path = os.path.join(my_path, schema_filename)
    schematron_path = os.path.join(my_path, schematron_filename)

    # Use in notification email to distinguish errors/warnings
    icons = {
        'facesmiling': '\U0001F600',
        'redx': '\U0000274C',  # use for parse errors
        'exclamation': '\U00002757',
        'warning': '\U000026A0\U0000FE0F',  # use for schema validation errors
        'qmark': '\U00002753'
    }

    data_folder = '/cul/cul0/ldpd/archivesspace/ead_cache'
    # data_folder = '/opt/dcps/archivesspace/test/ead' # for testing

    # Load files from directory into a list
    the_file_paths = []
    for root, dirs, files in os.walk(os.path.abspath(data_folder)):
        for file in files:
            the_file_paths.append(os.path.join(root, file))

    # The column heads for the report spreadsheet
    the_heads = [
        'bibid', 'file', 'well-formed?', 'valid?', 'schema output',
        'schematron output', 'warning type'
    ]

    the_results = []

    the_results.append(the_heads)

    # counters
    parse_errors = 0
    validation_errors = 0
    sch_warnings = 0

    for a_file in the_file_paths:
        the_file_data = []
        file_name = a_file.split('/')[-1]
        bibid = file_name.split('_')[-1].split('.')[0]

        # print('Processing ' + file_name)
        validation_result = jing_process(jing_path, a_file, schema_path)

        if 'fatal:' in validation_result:
            print(icons['redx'] + ' FATAL ERROR: ' + file_name +
                  ' could not be parsed!')
            wf_status = False
            validation_status = False
            parse_errors += 1
        else:
            wf_status = True
            if 'error:' in validation_result:
                validation_status = False
                print(icons['warning'] + ' ERROR: ' + file_name +
                      ' contains validation errors.')
                validation_errors += 1
            else:
                validation_status = True

        if validation_result:
            validation_result_clean = clean_output(validation_result,
                                                   incl_types=False)[0]
        else:
            validation_result_clean = validation_result

        if wf_status == False:
            schematron_result_clean = '-'
            warning_types = []

        else:
            # print('Result from schematron: ')
            schematron_result = jing_process(jing_path, a_file,
                                             schematron_path)

            if 'error:' in schematron_result:
                print('WARNING: ' + file_name +
                      ' has Schematron rule violations.')
                sch_warnings += 1

            if schematron_result:
                x = clean_output(schematron_result, incl_types=True)
                schematron_result_clean = x[0]
                warning_types = x[1]
            else:
                schematron_result_clean = ''
                warning_types = ''

        the_file_data = [
            bibid, file_name, wf_status, validation_status,
            validation_result_clean, schematron_result_clean,
            ', '.join(warning_types)
        ]

        the_results.append(the_file_data)

    # Write result data to spreadsheet
    the_data_sheet.clear()
    the_data_sheet.appendData(the_results)

    # generate log and add to log tab, if exists.
    the_tabs = the_data_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ('EADs from ' + data_folder + ' evaluated by ' +
               schema_filename + ' and ' + schematron_filename +
               '. Parse errors: ' + str(parse_errors) +
               '. Schema errors: ' + str(validation_errors) +
               '. Schematron warnings: ' + str(sch_warnings) +
               '. Start: ' + start_time + '. Finished: ' + end_time +
               ' (duration: ' + my_duration + ').')

    if 'log' in the_tabs:
        log_range = 'log!A:A'
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_data_sheet.id, log_range).appendData([[the_log]])
    else:
        print('*** Warning: There is no log tab in this sheet. ***')

    print(' ')

    # print(the_log)

    print('Parse errors: ' + str(parse_errors))
    print('Schema errors: ' + str(validation_errors))
    print('Schematron warnings: ' + str(sch_warnings))

    print(' ')

    print(' ')
    print('Script done. Check report sheet for more details: ' +
          the_data_sheet.url)

    quit()