Example #1
def z3950_query_manager(target, meta, matchpoint):
    """
    Oversees queries sent to Sierra Z3950
    args:
        target: dict of Z3950 connection parameters
        meta: bib metadata obj
        matchpoint: str, matchpoint to query by
    return:
        query result
    """
    module_logger.debug('Making new Z3950 request to: {}'.format(
        target['host']))
    try:
        result = queries.query_runner(
            'Z3950', target, meta, matchpoint)
        return result
    except ConnectionError:
        module_logger.error('Z3950 Connection error on host {}'.format(
            target['host']))
        raise OverloadError(
            'Connection error. Unable to reach Z3950 host: {}.'.format(
                target['host']))
    except ValueError:
        module_logger.error(
            'Z3950 ValueError on target parameters {}'.format(
                target))
        raise OverloadError(
            'Z3950 target not provided')
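
A minimal usage sketch (values are hypothetical; the call pattern follows Example #21 below):

target = {'host': 'sierra.example.org'}  # illustrative Z3950 settings dict
meta = VendorBibMeta(bib, vendor='UNKNOWN', dstLibrary='branches')
status, bibs = z3950_query_manager(target, meta, 'sierra_id')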
Example #2
def create_sheet_for_system(system, auth, sheet_name, tabs, parent_id=None):
    """
    creates a Google Sheet of the given name, with the layout
    used for NYPL/BPL reports
    args:
        system: string, 'NYPL' or 'BPL'
        auth: class 'oauth2client.client.OAuth2Credentials'
        sheet_name: string, name of the spreadsheet
        tabs: list, names of individual sheets
        parent_id: string, GDrive id of the folder the sheet is moved to
    returns:
        sheet_id: string, GDrive assigned id
    """

    sheet_id = goo.create_sheet(auth, sheet_name, tabs)

    # customize it
    if 'CallNumbers' in sheet_name:
        goo.customize_pvf_callNos_report(auth, sheet_id)
    elif 'Dups' in sheet_name and system == 'NYPL':
        goo.customize_nypl_pvf_dup_report(auth, sheet_id)
    elif 'Dups' in sheet_name and system == 'BPL':
        goo.customize_bpl_pvf_dup_report(auth, sheet_id)

    # move sheet to appropriate folder
    if not goo.file2folder(auth, parent_id, sheet_id):
        module_logger.error('Unable to move sheet {} to folder {}.'.format(
            sheet_id, parent_id))
        raise OverloadError('Failed to move {} document to '
                            'correct GDrive folder'.format(sheet_name))

    return sheet_id
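
A hypothetical call, assuming auth is an authorized oauth2client credentials object and parent_id an existing GDrive folder id (tab names are illustrative):

sheet_id = create_sheet_for_system(
    'NYPL', auth, 'pvf-Dups-report', ['branches', 'research'],
    parent_id='<gdrive-folder-id>')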
Example #3
def save_stats():
    module_logger.debug('Saving batch stats.')
    batch = shelve.open(BATCH_META)
    timestamp = batch['timestamp']
    system = batch['system']
    library = batch['library']
    agent = batch['agent']
    file_qty = len(batch['file_names'])
    batch.close()

    try:
        df = reports.shelf2dataframe(BATCH_STATS, system)
    except ValueError:
        df = None

    if df is not None:
        stats = reports.create_stats(system, df)

        with session_scope() as session:
            # find out if timestamp already added
            # if not add records
            # add batch record
            record = insert_or_ignore(session,
                                      PVR_Batch,
                                      timestamp=timestamp,
                                      system=system,
                                      library=library,
                                      agent=agent,
                                      file_qty=file_qty)
            session.flush()
            bid = record.bid
            for row in stats.iterrows():
                name = row[1]['vendor']
                record = insert_or_ignore(session, Vendor, name=name)
                session.flush()
                vid = record.vid

                if system == 'nypl':
                    record = insert_or_ignore(session,
                                              PVR_File,
                                              bid=bid,
                                              vid=vid,
                                              new=row[1]['insert'],
                                              dups=row[1]['attach'],
                                              updated=row[1]['update'],
                                              mixed=row[1]['mixed'],
                                              other=row[1]['other'])
                else:
                    record = insert_or_ignore(session,
                                              PVR_File,
                                              bid=bid,
                                              vid=vid,
                                              new=row[1]['insert'],
                                              dups=row[1]['attach'],
                                              updated=row[1]['update'])
    else:
        module_logger.warning(
            'Unable to create a dataframe from BATCH_STATS.')
        raise OverloadError(
            'Encountered problems while trying to save statistics.')
Example #4
def store_connection(name, host, folder, user, password, system):
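    """
    stores FTP connection details in the local datastore;
    normalizes empty strings to None and base64-obfuscates
    credentials (obfuscation only; this is not encryption)
    """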
    if name == '':
        name = None
    if host == '':
        host = None
    if folder == '':
        folder = None
    if system == '':
        system = None
    if user == '':
        user = None
    else:
        user = base64.b64encode(user)
    if password == '':
        password = None
    else:
        password = base64.b64encode(password)

    try:
        with session_scope() as db_session:
            insert_or_ignore(db_session,
                             FTPs,
                             name=name,
                             host=host,
                             folder=folder,
                             user=user,
                             password=password,
                             system=system)
    except IntegrityError as e:
        module_logger.error('Unable to store FTP details. Error: {}'.format(e))
        raise OverloadError('Error. The name of the new connection is\n'
                            'already used or some of the required elements\n'
                            'are missing.')
Example #5
def update_template(otid, record):
    try:
        with session_scope() as session:
            update_nypl_template(session, otid, **record)
    except IntegrityError as e:
        module_logger.error('IntegrityError on template update: {}'.format(e))
        raise OverloadError('Duplicate/missing template name\n'
                            'or missing primary matchpoint')
Example #6
def save_template(record):
    try:
        with session_scope() as session:
            insert_or_ignore(session, NYPLOrderTemplate, **record)
    except IntegrityError as e:
        module_logger.error('IntegrityError on template save: {}'.format(e))
        raise OverloadError('Duplicate/missing template name\n'
                            'or missing primary matchpoint')
Example #7
def connect2ftp(host, user, password):
    module_logger.debug('Connecting to FTP: {}.'.format(host))
    try:
        ftp = FTP(host)
        conn = ftp.login(user, password)
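        # FTP reply code 230 = 'User logged in, proceed' (RFC 959)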
        if conn[:3] == '230':
            module_logger.debug('Successful connection.')
            return ftp
        else:
            module_logger.error(
                'Unsuccessful connection attempt to FTP: {}.'.format(conn))
            raise OverloadError('Unable to connect to FTP.\n'
                                'Error: {}'.format(conn))
    except all_errors as e:
        module_logger.error('Unable to connect to: {}. {}'.format(host, e))
        raise OverloadError('Unable to connect to: {}.\n'
                            'Verify host and your credentials'.format(host))
Example #8
def delete_connection(name, system):
    with session_scope() as db_session:
        try:
            delete_record(db_session, FTPs, name=name, system=system)
        except Exception as exc:
            _, _, exc_traceback = sys.exc_info()
            tb = format_traceback(exc, exc_traceback)
            module_logger.error(
                'Unhandled error during deletion of FTP details. {}'.format(tb))
            raise OverloadError(exc)
Example #9
def store_in_vault(application, user, password):
    """
    stores credentials in Windows Credential Locker
    args:
        application: string, name of application
        user: string, name of user
        password: string
    """

    # check if credentials already stored and if so
    # delete and store updated ones
    try:
        if not get_from_vault(application, user):
            keyring.set_password(application, user, password)
        else:
            keyring.delete_password(application, user)
            keyring.set_password(application, user, password)
    except PasswordSetError as e:
        raise OverloadError(e)
    except PasswordDeleteError as e:
        raise OverloadError(e)
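
The get_from_vault helper is not shown in this listing; given the keyring calls above, it is presumably a thin wrapper such as the sketch below (keyring.get_password returns None when no entry exists):

import keyring

def get_from_vault(application, user):
    # hypothetical wrapper; returns the stored password or None
    return keyring.get_password(application, user)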
Example #10
def sierra_export_reader(source_fh, system, progbar1, progbar2):
    with open(source_fh, "r") as file:
        reader = csv.reader(file)

        # skip header
        header = reader.next()

        # check if Sierra export file has a correct structure
        if system == "NYPL":
            if header != NW2SEXPORT_COLS:
                raise OverloadError(
                    "Sierra Export format incorrect.\nPlease refer to help "
                    "for more info."
                )
        elif system == "BPL":
            if header != BW2SEXPORT_COLS:
                raise OverloadError(
                    "Sierra Export format incorrect.\nPlease refer to help "
                    "for more info."
                )
        estimate_progbars_max(reader, progbar1, progbar2)
Example #11
def save_report(data, outfile):
    # delete previous report
    if not remove_files([outfile]):
        raise OverloadError('Unable to delete previous default '
                            'validation report: {}.'.format(outfile))

    report = []
    for k, v in data.iteritems():
        report.append('\n{} - barcode dups:'.format(k))
        dups = []
        for f, p in v:
            dups.append('\tfile: {} -- record no:{}'.format(f, p))
        report.append('\n'.join(sorted(dups)))

    if report == []:
        report = ['No errors found']
    try:
        with open(outfile, 'w') as file:
            file.write('\n'.join(report))
    except IOError as e:
        raise OverloadError(
            'Unable to create a new default validation report. '
            'Error: {}'.format(e))
Example #12
def platform_queries_manager(api_type, session, meta, matchpoint):
    """
    Oversees queries sent to Platform
    args:
        api_type
        session obj
        meta obj
        matchpoint
    return:
        query result
    """
    module_logger.debug('Making new Platform request.')
    try:
        result = queries.query_runner(api_type, session, meta, matchpoint)
        return result

    except APITokenExpiredError:
        session.close()
        raise APITokenExpiredError(
            'Unable to perform query. Platform token expired.')

    except ConnectionError as e:
        module_logger.error('ConnectionError while running Platform queries. '
                            'Closing session and aborting processing.')
        session.close()
        raise OverloadError(e)

    except Timeout as e:
        module_logger.error('Timeout error while running Platform queries. '
                            'Closing session and aborting processing.')
        session.close()
        raise OverloadError(e)

    except ValueError as e:
        session.close()
        module_logger.error(e)
        raise OverloadError(e)
Example #13
def count_bibs(file):
    reader = read_marc21(file)
    bib_count = 0
    try:
        for bib in reader:
            bib_count += 1
        return bib_count
    except RecordLengthInvalid:
        raise OverloadError(
            "Attempted to process non-MARC file,\n"
            "or invalid MARC file: {}".format(file)
        )
    except UnicodeDecodeError:
        raise OverloadError(
            "Character encoding error in file:\n{}\n"
            "Please convert character encoding to UTF-8\n"
            "using MARCEdit program.".format(file)
        )
    except RecordDirectoryInvalid:
        raise OverloadError(
            "Encountered malformed MARC record directory\n"
            'in file "{}".\nUse MARCEdit to identify '
            "incorrect record.".format(file)
        )
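
read_marc21 and the Record* exceptions are not shown in this listing; a plausible sketch, assuming the pymarc library (pymarc.exceptions defines RecordLengthInvalid and RecordDirectoryInvalid):

from pymarc import MARCReader
from pymarc.exceptions import RecordDirectoryInvalid, RecordLengthInvalid

def read_marc21(file):
    # hypothetical wrapper: yields pymarc Record objects from a MARC21 file
    return MARCReader(open(file, 'rb'))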
Example #14
def set_nypl_sierra_bib_default_location(library, bib):
    """
    adds a 949 MARC tag command for setting bibliographic location
    args:
        library: string, 'branches' or 'research'
        bib: pymarc.record.Record
    returns:
        bib: pymarc.record.Record, with added command "bn=" to
            the "949  $a" field; the field is created if missing
    """

    # determine correct location code
    if library == "branches":
        defloc = NBIB_DEFAULT_LOCATIONS["branches"]
    elif library == "research":
        defloc = NBIB_DEFAULT_LOCATIONS["research"]
    else:
        raise OverloadError("Invalid library argument passed: {}".format(library))

    # determine if 949 already present
    if sierra_command_tag(bib):
        for field in bib.get_fields("949"):
            if field.indicators == [" ", " "]:
                command = field["a"].strip()
                if "bn=" in command:
                    # skip, already present
                    break
                else:
                    if command[-1] == ";":
                        new_command = "{}{}".format(field["a"], "bn={};".format(defloc))
                    else:
                        new_command = "{}{}".format(
                            field["a"], ";bn={};".format(defloc)
                        )
                    field["a"] = new_command
                    break

    else:
        # command tag not present, add it
        bib.add_field(
            Field(
                tag="949",
                indicators=[" ", " "],
                subfields=["a", "*bn={};".format(defloc)],
            )
        )
    return bib
Example #15
def decrypt_file_data(key, fh):
    """
    decrypts data in a file
    args:
        key: string, 16-byte (AES-128) encryption key
        fh: string, file handle of file to be decrypted
    returns:
        data: string
    """
    try:
        with open(fh, "rb") as file:
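            # reads: 16-byte nonce, 16-byte tag, then the rest as ciphertext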
            nonce, tag, ciphertext = [file.read(x) for x in (16, 16, -1)]
            cipher = AES.new(key, AES.MODE_EAX, nonce)
            data = cipher.decrypt_and_verify(ciphertext, tag)
            return data
    except ValueError as e:
        raise OverloadError(e)
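
The matching encryption routine is not part of this listing; a sketch assuming PyCryptodome's AES-EAX (the nonce/tag/ciphertext write order mirrors the reads above; the default nonce and tag are 16 bytes each):

from Crypto.Cipher import AES

def encrypt_file_data(key, data, fh):
    # hypothetical counterpart to decrypt_file_data above
    cipher = AES.new(key, AES.MODE_EAX)
    ciphertext, tag = cipher.encrypt_and_digest(data)
    with open(fh, 'wb') as file:
        for chunk in (cipher.nonce, tag, ciphertext):
            file.write(chunk)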
Example #16
def barcode_duplicates(batch, system):
    """
    Verifies there are no duplicate barcodes in the batch;
    parses all barcodes found in list of MARC files (batch),
    finds duplicates, and creates a report indicating files
    and records that are dups
    args:
        batch: list of MARC file handles
        system: string, 'nypl' or 'bpl'
    returns:
        dict of dups (key: barcode, value: list of (file, bib position) tuples)
    """
    barcodes = dict()
    dup_barcodes = dict()

    if system == 'nypl':
        item_tag = '949'
        item_tag_ind = [' ', '1']
        item_tag_sub = 'i'
    elif system == 'bpl':
        item_tag = '960'
        item_tag_ind = [' ', ' ']
        item_tag_sub = 'i'

    for fh in batch:
        try:
            reader = read_marc21(fh)
            pos = 0
            for record in reader:
                pos += 1
                for tag in record.get_fields(item_tag):
                    if tag.indicators == item_tag_ind:
                        for b in tag.get_subfields(item_tag_sub):
                            if b in barcodes:
                                new_value = barcodes[b]
                                new_value.append((fh, pos))
                                barcodes[b] = new_value
                            else:
                                barcodes[b] = [(fh, pos)]
        except UnicodeDecodeError as e:
            raise OverloadError(e)

    for k, v in barcodes.iteritems():
        if len(v) > 1:
            dup_barcodes[k] = v

    return dup_barcodes
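
barcode_duplicates pairs naturally with save_report from Example #11; a hypothetical pipeline (file names are illustrative):

dups = barcode_duplicates(['vendor_file1.mrc', 'vendor_file2.mrc'], 'nypl')
save_report(dups, 'dup_barcodes_report.txt')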
Example #17
def move2ftp(host, ftp, fh, dstfh, transfer_type):
    try:
        module_logger.debug('Uploading file to FTP: host={}, local path={}, '
                            'destination fh={}, transfer type={}'.format(
                                host, fh, dstfh, transfer_type))
        if transfer_type == 'binary':
            ftp.storbinary('STOR {}'.format(dstfh), open(fh, 'rb'))
        elif transfer_type == 'ASCII':
            ftp.storlines('STOR {}'.format(dstfh), open(fh, 'r'))
        module_logger.debug('Upload successful.')

    except all_errors as e:
        module_logger.error(
            'Upload to FTP failed: host={}, destination fh={}, '
            'transfer type={}. Error: {}'.format(host, dstfh, transfer_type,
                                                 e))
        raise OverloadError(
            'Encountered error while uploading file to FTP.\nAborting.')
Example #18
def move2local(host, ftp, fh, dstfh, transfer_type):
    try:
        module_logger.debug(
            'Downloading file from FTP: host={}, fh={}, destination path={}, '
            'transfer type={}'.format(host, fh, dstfh, transfer_type))
        if transfer_type == 'binary':
            with open(dstfh, 'wb') as f:
                ftp.retrbinary('RETR %s' % fh, lambda data: f.write(data))
        elif transfer_type == 'ASCII':
            with open(dstfh, 'w') as f:
                # retrlines strips each line's trailing CRLF; restore newlines
                ftp.retrlines('RETR %s' % fh,
                              lambda line: f.write(line + '\n'))
        module_logger.debug('Download successful.')

    except all_errors as e:
        module_logger.error(
            'Download from FTP failed: host={}, file on remote={}, '
            'destination path={}, transfer type={}. Error: {}'.format(
                host, fh, dstfh, transfer_type, e))
        raise OverloadError('Encountered error while downloading the file.')
Example #19
def launch_process(
    source_fh,
    data_source,
    system,
    library,
    progbar1,
    progbar2,
    process_label,
    hits,
    nohits,
    skipped,
    meet_crit_counter,
    fail_user_crit_counter,
    fail_glob_crit_counter,
    action,
    encode_level,
    mat_type,
    cat_rules,
    cat_source,
    recap_range,
    id_type="ISBN",
    api=None,
):
    """
    work notes:
    1. iterate through the source files and extract bib/order metadata
    2. temporarily persist this data in local datastore
    3. iterate over the batch and find best hit for each
    4. persist in local store matched record as a pymarc object
    5. display results (with all data needed for Sierra import) to user
    6. allow user to decide what to write to final file

    args:
        source_fh: str, file path
        data_source: str, 'Sierra export' or 'IDs list'
        system: str, 'NYPL' or 'BPL'
        library: str, 'research' or 'branches'
        progbar1: tkinter widget, overall progressbar
        progbar2: tkinter widget, task progressbar
        process_label: tkinter StringVar, current task label
        hits: tkinter IntVar, hits counter
        nohits: tkinter IntVar, failed search counter
        skipped: tkinter IntVar, skipped records counter
        meet_crit_counter: tkinter IntVar, success match & eval counter
        fail_user_crit_counter: tkinter IntVar, failed user criteria counter
        fail_glob_crit_counter: tkinter IntVar, failed global criteria counter
        action: str, 'catalog' or 'upgrade'
        encode_level: str, 'any', ...
        mat_type: str, 'any', print', 'large print', 'dvd', 'bluray'
        cat_rules: str,  'any', 'RDA-only'
        cat_source: str, 'any', 'DLC'
        recap_range: list, lower and upper limits of ReCAP numbers
        id_type: str, 'ISBN', 'UPC', 'ISSN', 'LCCN', 'OCLC #'
        api: str, name of api to be used for queries
    """

    if mat_type == "":
        mat_type = None
    if cat_source == "":
        cat_source = None

    module_logger.debug(
        "Launching W2S process. "
        "Params: source_fh:{}, data_source:{}, system:{}, "
        "library:{}, action:{}, encode_level:{}, mat_type:{}, "
        "cat_rules:{}, cat_source:{}, recap_range:{}, id_type:{}, "
        "api:{}".format(
            source_fh,
            data_source,
            system,
            library,
            action,
            encode_level,
            mat_type,
            cat_rules,
            cat_source,
            recap_range,
            id_type,
            api,
        )
    )

    processed_counter = 0
    found_counter = 0
    not_found_counter = 0
    skipped_counter = 0

    remove_previous_process_data()

    # validate correctness of sierra export
    process_label.set("reading:")

    if data_source == "Sierra export":
        sierra_export_reader(source_fh, system, progbar1, progbar2)
    elif data_source == "IDs list":
        id_list_reader(source_fh, progbar1, progbar2)

    # keep track of recap call numbers
    if recap_range:
        recap_no = recap_range[0]
    else:
        recap_no = None

    with session_scope() as db_session:
        # create batch record
        batch_rec = insert_or_ignore(
            db_session,
            WCSourceBatch,
            file=source_fh,
            system=system,
            library=library,
            action=action,
            api=api,
            data_source=data_source,
            encode_level=encode_level,
            mat_type=mat_type,
            cat_rules=cat_rules,
            cat_source=cat_source,
            id_type=id_type,
        )
        db_session.flush()
        batch_id = batch_rec.wcsbid

        # parse depending on the data source
        if data_source == "IDs list":
            with open(source_fh, "r") as file:
                reader = csv.reader(file)
                # skip header
                reader.next()
                if id_type == "ISBN":
                    for row in reader:
                        meta = BibOrderMeta(
                            system=system, dstLibrary=library, t020=[parse_isbn(row[0])]
                        )
                        insert_or_ignore(
                            db_session, WCSourceMeta, wcsbid=batch_id, meta=meta
                        )
                        update_progbar(progbar1)
                        update_progbar(progbar2)
                elif id_type == "UPC":
                    raise OverloadError("Not implemented.")

                    # will be implemented later
                    # for row in reader:
                    #     meta = BibOrderMeta(
                    #         system=system,
                    #         dstLibrary=library,
                    #         t024=[parse_upc(row[0])])

                elif id_type == "OCLC #":
                    for row in reader:
                        meta = BibOrderMeta(
                            system=system, dstLibrary=library, t001=row[0]
                        )
                        insert_or_ignore(
                            db_session, WCSourceMeta, wcsbid=batch_id, meta=meta
                        )
                        update_progbar(progbar1)
                        update_progbar(progbar2)
                else:
                    raise OverloadError("Not implemented.")

        elif data_source == "Sierra export":
            data = sierra_export_data(source_fh, system, library)
            for meta, single_order in data:
                if single_order is None:
                    row = ["b{}a".format(meta.sierraId), meta.title]
                    skipped_counter += 1
                    skipped.set(skipped_counter)
                    save2csv(W2S_SKIPPED_ORD, row)
                    progbar1["maximum"] = progbar1["maximum"] - 3
                elif single_order is False:
                    row = ["b{}a".format(meta.sierraId), meta.title]
                    skipped_counter += 1
                    skipped.set(skipped_counter)
                    save2csv(W2S_MULTI_ORD, row)
                    progbar1["maximum"] = progbar1["maximum"] - 3
                else:
                    insert_or_ignore(
                        db_session, WCSourceMeta, wcsbid=batch_id, meta=meta
                    )
                    update_progbar(progbar1)
                update_progbar(progbar2)

        creds = get_credentials(api)
        wskey = creds["key"]
        db_session.commit()

        # query Worldcat
        process_label.set("querying:")
        # reset progbar2
        progbar2["value"] = 0
        metas = retrieve_records(db_session, WCSourceMeta, wcsbid=batch_id)
        with SearchSession(credentials=wskey) as session:
            for m in metas:
                module_logger.debug(m.meta)
                hit = False
                if m.meta.t001:
                    query = construct_sru_query(
                        m.meta.t001,
                        keyword_type="OCLC #",
                        mat_type=mat_type,
                        cat_source=cat_source,
                    )
                    res = session.sru_query(query=query)
                    module_logger.debug("OCLC# request: {}".format(res.url))

                    hit = interpret_search_response(res, db_session, m.wcsmid)

                    if hit:
                        found_counter += 1

                if m.meta.t010 and not hit:
                    query = construct_sru_query(
                        m.meta.t010,
                        keyword_type="LCCN",
                        mat_type=mat_type,
                        cat_source=cat_source,
                    )
                    res = session.sru_query(query=query)
                    module_logger.debug("LCCN request: {}".format(res.url))

                    hit = interpret_search_response(res, db_session, m.wcsmid)

                    if hit:
                        found_counter += 1

                if m.meta.t020 and not hit:
                    # will iterate over all ISBNs if no hits
                    for isbn in m.meta.t020:
                        query = construct_sru_query(
                            isbn,
                            keyword_type="ISBN",
                            mat_type=mat_type,
                            cat_source=cat_source,
                        )
                        res = session.sru_query(query=query)
                        module_logger.debug("ISBN request: {}".format(res.url))

                        hit = interpret_search_response(res, db_session, m.wcsmid)

                        if hit:
                            found_counter += 1
                            break  # stop searching

                if m.meta.t024 and not hit:
                    for upc in m.meta.t024:
                        query = construct_sru_query(
                            upc,
                            keyword_type="UPC",
                            mat_type=mat_type,
                            cat_source=cat_source,
                        )
                        res = session.sru_query(query=query)
                        module_logger.debug("UPC request: {}".format(res.url))

                        hit = interpret_search_response(res, db_session, m.wcsmid)

                        if hit:
                            found_counter += 1
                            break  # stop searching

                if not hit:
                    not_found_counter += 1
                    module_logger.debug(
                        "Unable to find any matches in Worldcat for {}.".format(m.meta)
                    )
                    interpret_search_response(None, db_session, m.wcsmid)

                hits.set(found_counter)
                nohits.set(not_found_counter)

                update_progbar(progbar1)
                update_progbar(progbar2)
                processed_counter += 1

        db_session.commit()

        # check if meet criteria
        process_label.set("analyzing:")
        progbar2["value"] = 0
        rows = retrieve_records(db_session, WCHit, hit=True)
        for row in rows:
            results = row.query_results
            recs = results2record_list(results)
            for xml_record in recs:
                fulfills = False
                fail_types = []
                if meets_upgrade_criteria(xml_record):
                    if meets_user_criteria(
                        xml_record, encode_level, mat_type, cat_rules, cat_source
                    ):
                        fulfills = True
                        if action == "upgrade":
                            meet_crit_counter.set(meet_crit_counter.get() + 1)

                            oclcNo = get_oclcNo(xml_record)
                            update_hit_record(
                                db_session, WCHit, row.wchid, match_oclcNo=oclcNo
                            )

                            update_progbar(progbar1)
                            update_progbar(progbar2)
                            break

                        elif action == "catalog":
                            if meets_catalog_criteria(xml_record, library):
                                fulfills = True
                                meet_crit_counter.set(meet_crit_counter.get() + 1)
                                oclcNo = get_oclcNo(xml_record)
                                update_hit_record(
                                    db_session, WCHit, row.wchid, match_oclcNo=oclcNo
                                )

                                update_progbar(progbar1)
                                update_progbar(progbar2)
                                break
                            else:
                                fail_types.append("global")
                    else:
                        fail_types.append("user")
                else:
                    fail_types.append("global")

            if not fulfills:
                if "user" in fail_types:
                    fail_user_crit_counter.set(fail_user_crit_counter.get() + 1)
                else:
                    fail_glob_crit_counter.set(fail_glob_crit_counter.get() + 1)

        db_session.commit()

        # download and prep
        process_label.set("downloading:")
        # reset progbar2
        progbar2["value"] = 0

        # obtain access token
        token = get_token(creds)
        if token.token_str is None:
            module_logger.error(
                "Worldcat token not obtained. Error: {}.".format(token.server_response)
            )
        else:
            module_logger.debug("Worldcat token obtained.")

        # open Metadata API session
        with MetadataSession(credentials=token) as session:
            metas = retrieve_related(
                db_session, WCSourceMeta, "wchits", wcsbid=batch_id
            )
            for m in metas:
                if m.wchits.match_oclcNo:
                    xml_record = request_record(session, m.wchits.match_oclcNo)
                    if xml_record is not None:
                        update_hit_record(
                            db_session, WCHit, m.wchits.wchid, match_marcxml=xml_record
                        )
                update_progbar(progbar1)
                update_progbar(progbar2)

        db_session.commit()

        # prepare MARC files
        process_label.set("prepping:")
        progbar2["value"] = 0

        # check if Sierra bib # provided and use
        # for overlay command line
        rows = retrieve_records(db_session, WCSourceMeta, wcsbid=batch_id)

        for row in rows:
            # initial workflow shared with the upgrade functionality
            xml_record = row.wchits.match_marcxml
            if xml_record is not None:
                marc_record = marcxml2array(xml_record)[0]
                remove_unsupported_subject_headings(system, marc_record)
                remove_unwanted_tags(marc_record)
                remove_ebook_isbns(marc_record)
                marc_record.remove_fields("901", "907", "945", "949", "947")
                initials = create_initials_field(system, library, "W2Sbot")
                marc_record.add_ordered_field(initials)

                if data_source == "Sierra export":
                    order_data = row.meta
                    if order_data.sierraId:
                        overlay_tag = create_target_id_field(
                            system, order_data.sierraId
                        )
                        marc_record.add_ordered_field(overlay_tag)

                if system == "NYPL":
                    marc_record.remove_fields("001", "910")
                    tag_001 = nypl_oclcNo_field(xml_record)
                    marc_record.add_ordered_field(tag_001)

                    # add Sierra bib code 3 and default location
                    if library == "branches":
                        defloc = NBIB_DEFAULT_LOCATIONS["branches"]
                    elif library == "research":
                        defloc = NBIB_DEFAULT_LOCATIONS["research"]

                    tag_949 = create_command_line_field("*b3=h;bn={};".format(defloc))
                    marc_record.add_ordered_field(tag_949)

                if action == "catalog":
                    # add call number & persist
                    if data_source == "Sierra export":
                        order_data = row.meta

                        local_fields = create_local_fields(
                            xml_record,
                            system,
                            library,
                            order_data=order_data,
                            recap_no=recap_no,
                        )

                    else:
                        # data source a list of IDs
                        local_fields = create_local_fields(
                            xml_record, system, library, recap_no=recap_no
                        )

                    if local_fields:
                        for field in local_fields:
                            if field is not None:
                                marc_record.add_ordered_field(field)
                        if system == "NYPL" and library == "research":
                            recap_no += 1

                update_hit_record(
                    db_session, WCHit, row.wchits.wchid, prepped_marc=marc_record
                )

            update_progbar(progbar1)
            update_progbar(progbar2)

            # make sure W2S stays within assigned Recap range
            if system == "NYPL" and library == "research":
                if action == "catalog":
                    if recap_no > recap_range[1]:
                        raise OverloadError(
                            "Used all available ReCAP call numbers " "assigned for W2S."
                        )

    # show completed
    progbar1["value"] = progbar1["maximum"]
    progbar2["value"] = progbar2["maximum"]
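
For reference, the Worldcat query cascade above (OCLC # -> LCCN -> ISBN(s) -> UPC(s), stopping at the first hit) could be condensed as in this sketch; first_hit is an illustrative name, everything else comes from the code above:

def first_hit(session, db_session, m, mat_type, cat_source):
    # try each identifier type in order; stop at the first hit
    candidates = [
        ('OCLC #', [m.meta.t001] if m.meta.t001 else []),
        ('LCCN', [m.meta.t010] if m.meta.t010 else []),
        ('ISBN', m.meta.t020 or []),
        ('UPC', m.meta.t024 or []),
    ]
    for keyword_type, values in candidates:
        for value in values:
            query = construct_sru_query(
                value, keyword_type=keyword_type,
                mat_type=mat_type, cat_source=cat_source)
            res = session.sru_query(query=query)
            if interpret_search_response(res, db_session, m.wcsmid):
                return True
    # record the miss, as the original loop does
    interpret_search_response(None, db_session, m.wcsmid)
    return False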
Example #20
def open_platform_session(api_name=None):
    """
    wrapper around Platform authorization and Platform session obj
    args:
        api_name: str, name of the Platform API connection
    return:
        session obj
    """
    module_logger.debug('Prepping to open Platform session.')
    reusing_token = False
    try:
        ud = shelve.open(USER_DATA, writeback=True)

        # retrieve specified Platform authorization
        conn_data = ud['PlatformAPIs'][api_name]
        client_id = base64.b64decode(conn_data['client_id'])
        auth_server = conn_data['oauth_server']
        base_url = conn_data['host']
        last_token = conn_data['last_token']  # encrypt?

        # retrieve secret from Windows Vault
        client_secret = credentials.get_from_vault(auth_server, client_id)

        # check if valid token exists and reuse if can
        if last_token is not None:
            if last_token.get('expires_on') < datetime.now():
                # token expired, request new one
                module_logger.info(
                    'Platform token expired. Requesting new one.')
                auth = AuthorizeAccess(client_id, client_secret, auth_server)
                token = auth.get_token()
            else:
                module_logger.debug(
                    'Last Platform token still valid. Re-using.')
                reusing_token = True
                token = last_token
        else:
            module_logger.debug('Requesting Platform access token.')
            auth = AuthorizeAccess(client_id, client_secret, auth_server)
            token = auth.get_token()

        # save token for reuse
        if not reusing_token:
            module_logger.debug('Saving Platform token for reuse.')
            ud['PlatformAPIs'][api_name]['last_token'] = token

    except KeyError as e:
        module_logger.error(
            'KeyError in user_data: api name: {}. Error msg:{}'.format(
                api_name, e))
        raise OverloadError(
            'Error parsing user_data while retrieving connection info.')

    except ValueError as e:
        module_logger.error(e)
        raise OverloadError(e)

    except APITokenError as e:
        module_logger.error('Platform API Token Error: {}'.format(e))
        raise OverloadError(e)

    except ConnectionError as e:
        module_logger.error('Platform Connection Error: {}'.format(e))
        raise OverloadError(e)

    except Timeout as e:
        module_logger.error('Platform Timeout Error: {}'.format(e))
        raise OverloadError(e)

    finally:
        ud.close()

    # open Platform session
    try:
        module_logger.debug('Auth obtained. Opening Platform session.')
        session = PlatformSession(base_url, token)
        return session

    except ValueError as e:
        module_logger.error(e)
        raise OverloadError(e)

    except APITokenExpiredError as e:
        module_logger.error('Platform API token expired: {}'.format(e))
        raise OverloadError(e)
Example #21
def run_processing(files, system, library, agent, api_type, api_name, template,
                   output_directory, progbar, current_process_label):
    """
    args:
        template: instance of NYPLOrderTemplate class
    """

    # agent argument is 3 letter code

    module_logger.debug('PVR process launched.')

    # tokens and sessions are opened on this level

    # determine destination API
    if api_type == 'Platform API':
        module_logger.debug('Creating Platform API session.')
        try:
            session = open_platform_session(api_name)
        except OverloadError:
            raise
    elif api_type == 'Z3950':
        module_logger.debug(
            'retrieving Z3950 settings for {}'.format(api_name))
        user_data = shelve.open(USER_DATA)
        target = user_data['Z3950s'][api_name]
        user_data.close()

    elif api_type == 'Sierra API':
        module_logger.debug('Connecting to Sierra API')

    # clean-up batch metadata & stats
    if not template:
        template_name = None
    else:
        template_name = template.tName
    module_logger.debug('Opening BATCH_META.')
    batch = shelve.open(BATCH_META, writeback=True)
    batch.clear()
    module_logger.debug('BATCH_META has been emptied from previous content.')
    timestamp = datetime.now()
    batch['timestamp'] = timestamp
    batch['system'] = system
    batch['library'] = library
    batch['agent'] = agent
    batch['template'] = template_name
    batch['file_names'] = files
    batch.close()
    module_logger.debug('BATCH_META new data: {}, {}, {}, {}, {}, {}'.format(
        timestamp, system, library, agent, template_name, files))

    stats = shelve.open(BATCH_STATS, writeback=True)
    stats.clear()

    if not remove_files([BARCODES]):
        module_logger.error(
            'Unable to empty BARCODES storage at location {}'.format(BARCODES))
        raise OverloadError('Unable to delete barcodes from previous batch.')
    else:
        module_logger.debug(
            'BATCH_STATS has been emptied from previous content.')

    # determine output mrc file names
    if agent == 'cat':
        date_today = date.today().strftime('%y%m%d')
        fh_dups = os.path.join(output_directory,
                               '{}.DUP-0.mrc'.format(date_today))
        fh_new = os.path.join(output_directory,
                              '{}.NEW-0.mrc'.format(date_today))

        # delete existing files to start over from scratch
        if not remove_files([fh_new, fh_dups]):
            module_logger.warning(
                'Unable to delete PVF output files from previous batch.')
            raise OverloadError(
                'Unable to delete output files from previous batch.')

    elif agent in ('sel', 'acq'):
        # remove .mrc extension if present
        tail = os.path.split(files[0])[1]
        if tail[-4:] == '.mrc':
            tail = tail[:-4]
        tail = '{}.PRC-0.mrc'.format(tail)
        fh = os.path.join(output_directory, tail)

        # delete existing files to start over from scratch
        if not remove_files([fh]):
            module_logger.warning(
                'Unable to delete PVF output files from previous batch.')
            raise OverloadError(
                'Unable to delete output files from previous batch.')

    # create reference index
    module_logger.debug('Creating vendor index data for {}-{}'.format(
        system, agent))
    if agent == 'cat':
        rules = './rules/cat_rules.xml'
        vx = vendor_index(rules, system)  # wrap in exception?
    elif agent in ('sel', 'acq'):
        if system == 'nypl':
            query_matchpoints = dict()
            try:
                if template.match1st == 'sierra_id':
                    query_matchpoints['primary'] = ('id', template.match1st)
                else:
                    query_matchpoints['primary'] = ('tag', template.match1st)
                if template.match2nd is not None:
                    if template.match2nd == 'sierra_id':
                        query_matchpoints['secondary'] = ('id',
                                                          template.match2nd)
                    else:
                        query_matchpoints['secondary'] = ('tag',
                                                          template.match2nd)
                if template.match3rd is not None:
                    if template.match3rd == 'sierra_id':
                        query_matchpoints['tertiary'] = ('id',
                                                         template.match3rd)
                    else:
                        query_matchpoints['tertiary'] = ('tag',
                                                         template.match3rd)
            except NoResultFound:
                raise OverloadError('Unable to find template {}.\n'
                                    'Please verify it exists.'.format(
                                        template.tName))
            except AttributeError:
                raise OverloadError('Error while applying order template.')
        else:
            raise OverloadError(
                'selection workflow for BPL not implemented yet')

    # run queries and results analysis for each bib in each file
    n = 0
    f = 0
    for file in files:
        f += 1
        module_logger.debug(
            'Opening new MARC reader for file: {}'.format(file))
        reader = read_marc21(file)

        current_process_label.set('querying...')
        for bib in reader:
            n += 1

            if agent == 'cat':
                vendor = identify_vendor(bib, vx)

                try:
                    query_matchpoints = get_query_matchpoint(vendor, vx)
                    module_logger.debug(
                        'Cat vendor index has following query matchpoints: '
                        '{} for vendor {}.'.format(query_matchpoints, vendor))

                except KeyError:
                    module_logger.critical(
                        'Unable to match vendor {} with data '
                        'in cat vendor index'.format(vendor))
            elif agent in ('sel', 'acq'):
                # vendor code
                if system == 'nypl':
                    vendor = template.vendor
                    if vendor is None:
                        # do not apply but keep for stats
                        vendor = 'UNKNOWN'

            if vendor == 'UNKNOWN':
                module_logger.debug(
                    'Encountered unidentified vendor in record #{} '
                    'in file {} (system={}, library={}, agent={})'.format(
                        n, file, system, library, agent))

            # determine vendor bib meta
            meta_in = VendorBibMeta(bib, vendor=vendor, dstLibrary=library)
            module_logger.info('Vendor bib meta: {}'.format(str(meta_in)))

            # store barcodes found in vendor files for verification
            module_logger.debug('Storing barcodes for verification.')
            with open(BARCODES, 'a') as barcodes_file:
                for b in meta_in.barcodes:
                    barcodes_file.write(b + '\n')

            # Platform API workflow
            if api_type == 'Platform API':
                matchpoint = query_matchpoints['primary'][1]
                module_logger.debug(
                    'Using primary matchpoint: {}.'.format(matchpoint))
                try:
                    result = run_platform_queries(api_type, session, meta_in,
                                                  matchpoint)

                except APITokenExpiredError:
                    module_logger.info(
                        'Platform token expired. '
                        'Requesting new one and opening new session.')
                    session = open_platform_session(api_name)
                    result = platform_queries_manager(api_type, session,
                                                      meta_in, matchpoint)

                # run_platform_queries returns a tuple (status, response in json)
                meta_out = []

                if result[0] == 'hit':
                    meta_out = platform2meta(result[1])

                elif result[0] == 'nohit':
                    # requery with alternative matchpoint
                    if 'secondary' in query_matchpoints:
                        matchpoint = query_matchpoints['secondary'][1]
                        module_logger.debug(
                            'Using secondary matchpoint: {}.'.format(
                                matchpoint))

                        # run platform request for the secondary matchpoint
                        try:
                            result = run_platform_queries(
                                api_type, session, meta_in, matchpoint)

                        except APITokenExpiredError:
                            module_logger.info(
                                'Requesting new Platform token. '
                                'Opening new session.')

                            session = open_platform_session(api_name)
                            result = run_platform_queries(
                                api_type, session, meta_in, matchpoint)
                            # other exceptions raised in run_platform_queries

                        if result[0] == 'hit':
                            meta_out = platform2meta(result[1])
                        elif result[0] == 'nohit':
                            # run query for the 3rd matchpoint
                            if 'tertiary' in query_matchpoints:
                                matchpoint = query_matchpoints['tertiary'][1]
                                module_logger.debug(
                                    'Using tertiary matchpoint: {}.'.format(
                                        matchpoint))

                                # run platform request for the tertiary
                                # matchpoint
                                try:
                                    result = run_platform_queries(
                                        api_type, session, meta_in, matchpoint)

                                except APITokenExpiredError:
                                    module_logger.info(
                                        'Requesting new Platform token. '
                                        'Opening new session.')

                                    session = open_platform_session(api_name)
                                    result = run_platform_queries(
                                        api_type, session, meta_in, matchpoint)
                                if result[0] == 'hit':
                                    meta_out = platform2meta(result[1])
                                elif result[0] == 'error':
                                    raise OverloadError(
                                        'Platform server error.')
                        elif result[0] == 'error':
                            raise OverloadError('Platform server error.')
                    else:
                        module_logger.debug(
                            'No secondary matchpoint specified. '
                            'Ending queries.')
                elif result[0] == 'error':
                    raise OverloadError('Platform server error.')

            # queries performed via Z3950
            elif api_type == 'Z3950':
                meta_out = []
                matchpoint = query_matchpoints['primary'][1]
                module_logger.debug(
                    'Using primary matchpoint: {}.'.format(matchpoint))
                status, bibs = z3950_query_manager(target, meta_in, matchpoint)
                if status == 'hit':
                    meta_out = bibs2meta(bibs)
                elif status == 'nohit' and \
                        'secondary' in query_matchpoints:
                    matchpoint = query_matchpoints['secondary'][1]
                    module_logger.debug(
                        'Using secondary matchpoint: {}'.format(matchpoint))
                    status, bibs = z3950_query_manager(target, meta_in,
                                                       matchpoint)
                    if status == 'hit':
                        meta_out = bibs2meta(bibs)
                    elif status == 'nohit' and \
                            'tertiary' in query_matchpoints:
                        matchpoint = query_matchpoints['tertiary'][1]
                        module_logger.debug(
                            'Using tertiary matchpoint: {}'.format(matchpoint))
                        status, bibs = z3950_query_manager(
                            target, meta_in, matchpoint)
                        if status == 'hit':
                            meta_out = bibs2meta(bibs)
                module_logger.info('Retrieved bibs meta: {}'.format(meta_out))

            # queries performed via Sierra API
            elif api_type == 'Sierra API':
                module_logger.error('Sierra API is not implemented yet.')
                raise OverloadError('Sierra API is not implemented yet.')
            else:
                module_logger.error('Invalid api_type')
                raise OverloadError('Invalid api_type encountered.')

            if system == 'nypl':
                analysis = PVR_NYPLReport(agent, meta_in, meta_out)
            elif system == 'bpl':
                analysis = PVR_BPLReport(agent, meta_in, meta_out)

            module_logger.debug('Analyzing query results and vendor bib')
            analysis = analysis.to_dict()

            # apply patches if needed
            try:
                bib = patches.bib_patches(system, library, agent, vendor, bib)
            except AssertionError as e:
                module_logger.warning(
                    'Unable to patch bib. Error: {}'.format(e))
                analysis['callNo_match'] = False

            module_logger.info('PVF analysis results: {}'.format(analysis))

            # save analysis to shelf for statistical purposes
            stats[str(n)] = analysis

            # output processed records according to analysis
            # add Sierra bib id if matched

            # enforce utf-8 encoding in MARC leader
            bib.leader = bib.leader[:9] + 'a' + bib.leader[10:]

            sierra_id_present = check_sierra_id_presence(system, bib)
            module_logger.debug(
                'Checking if vendor bib has Sierra ID provided: '
                '{}'.format(sierra_id_present))

            if not sierra_id_present and \
                    analysis['target_sierraId'] is not None:

                try:
                    module_logger.info(
                        'Adding target Sierra id ({}) MARC field '
                        'to vendor record {}.'.format(
                            analysis['vendor_id'],
                            analysis['target_sierraId']))
                    bib.add_field(
                        create_target_id_field(system,
                                               analysis['target_sierraId']))

                except ValueError as e:
                    module_logger.error(e)
                    raise OverloadError(e)

            # add fields form bib & order templates
            module_logger.debug(
                'Adding template field(s) to the vendor record.')

            if agent == 'cat':
                templates = vx[vendor].get('bib_template')
                module_logger.debug('Selected CAT templates for {}: {}'.format(
                    vendor, templates))
                for catTemp in templates:
                    # skip if present or always add
                    if catTemp['tag'] == '949' and \
                            analysis['action'] == 'attach':
                        pass
                    elif catTemp['option'] == 'skip':
                        if catTemp['tag'] not in bib:
                            module_logger.debug('Field {} not present, adding '
                                                'from template'.format(
                                                    catTemp['tag']))
                            new_field = create_field_from_template(catTemp)
                            bib.add_field(new_field)
                        else:
                            module_logger.debug(
                                'Field {} found. Skipping.'.format(
                                    catTemp['tag']))
                    elif catTemp['option'] == 'add':
                        module_logger.debug(
                            'Field {} being added without checking '
                            'if already present'.format(catTemp['tag']))
                        new_field = create_field_from_template(catTemp)
                        bib.add_field(new_field)

            elif agent in ('sel', 'acq'):
                # batch template details should be retrieved instead for the
                # whole batch = no need to pull it for each bib

                new_fields = []
                if '960' in bib:
                    for t960 in bib.get_fields('960'):
                        new_field = db_template_to_960(template, t960)
                        if new_field:
                            new_fields.append(new_field)
                    bib.remove_fields('960')
                else:
                    new_field = db_template_to_960(template, None)
                    if new_field:
                        new_fields.append(new_field)

                # add modified fields back to record
                for field in new_fields:
                    bib.add_field(field)

                new_fields = []
                if '961' in bib:
                    for t961 in bib.get_fields('961'):
                        new_field = db_template_to_961(template, t961)
                        if new_field:
                            new_fields.append(new_field)
                    # remove existing fields
                    # (will be replaced by modified ones)
                    bib.remove_fields('961')
                else:
                    new_field = db_template_to_961(template, None)
                    if new_field:
                        new_fields.append(new_field)

                # add modified fields to bib
                for field in new_fields:
                    bib.add_field(field)

                if template.bibFormat and \
                        not sierra_command_tag(bib) and \
                        agent == 'sel':
                    new_field = db_template_to_949(template.bibFormat)
                    bib.add_field(new_field)
                    # it's safer for acquisition to skip command in 949 -
                    # there are conflicts with Import Invoices load table

            # apply bibliographic default location to NYPL brief records
            if system == 'nypl' and agent == 'sel':
                bib = set_nypl_sierra_bib_default_location(library, bib)

            # append to appropriate output file
            if agent == 'cat':
                if analysis['action'] == 'attach':
                    module_logger.debug(
                        'Appending vendor record to the dup file.')
                    write_marc21(fh_dups, bib)
                else:
                    module_logger.debug(
                        'Appending vendor record to the new file.')
                    write_marc21(fh_new, bib)
            else:
                module_logger.debug('Appending vendor record to a prc file.')
                write_marc21(fh, bib)

            # update progbar
            progbar['value'] = n
            progbar.update()

    # dedup new cataloging file
    if agent == 'cat' and os.path.isfile(fh_new):
        current_process_label.set('deduping...')

        dups, combined_count, deduped_fh = dedup_marc_file(fh_new, progbar)

        batch = shelve.open(BATCH_META, writeback=True)
        batch['duplicate_bibs'] = '{} dups merged into {} bibs'.format(
            dups, combined_count)
        batch.close()

        # delete original file and rename deduped
        if deduped_fh is not None:
            try:
                os.remove(fh_new)
                os.rename(deduped_fh, fh_new)
            except WindowsError:
                raise OverloadError('Unable to manipulate deduped file')

    # validate integrity of processed files for cataloging
    files_out = []
    if agent == 'cat':
        if os.path.isfile(fh_dups):
            files_out.append(fh_dups)
        if os.path.isfile(fh_new):
            files_out.append(fh_new)

        valid, missing_barcodes = validate_processed_files_integrity(
            files_out, BARCODES)
        module_logger.debug(
            'Integrity validation: {}, missing_barcodes: {}'.format(
                valid, missing_barcodes))
        if not valid:
            module_logger.error(
                'Barcodes integrity error: {}'.format(missing_barcodes))

    batch = shelve.open(BATCH_META, writeback=True)
    processing_time = datetime.now() - batch['timestamp']

    module_logger.info(
        'Batch processing stats: system={}, library={}, agent={}, user={}, '
        'used template={}, file count={}, files={}, record count={}, '
        'processing time={}'.format(system, library, agent, USER_NAME,
                                    template_name, f,
                                    [os.path.split(file)[1]
                                     for file in files], n, processing_time))
    batch['processing_time'] = processing_time
    batch['processed_files'] = f
    batch['processed_bibs'] = n
    if agent == 'cat':
        batch['processed_integrity'] = valid
        batch['missing_barcodes'] = missing_barcodes
    batch.close()
    stats.close()

    # clean-up
    # close any open session if Platform or Sierra API has been used
    if api_type in ('Platform API', 'Sierra API') and session is not None:
        session.close()
        module_logger.debug('Closing API session.')

    if agent == 'cat' and not valid:
        raise OverloadError(
            'Duplicate or missing barcodes found in processed files.')
Example #22
def read_ftp_content(ftp, host):
    module_logger.debug(
        'Accessing FTP ({}) directory & file listing'.format(host))
    # create a list of directories and files
    ls = []
    try:
        ftp.retrlines('LIST', ls.append)
    except all_errors as e:
        module_logger.error(
            'Unable to retrieve file & directory list on FTP host {}. '
            'Error: {}'.format(host, e))
        raise OverloadError('Encountered error while retrieving\n'
                            'content of the FTP server.')

    # load available FTP parsing methods
    try:
        with open('./rules/ftp_parsing.json', 'r') as ftp_settings:
            fs = json.load(ftp_settings)
    except IOError as e:
        module_logger.error(
            'Unable to open FTP settings JSON file. Error: {}'.format(e))
        raise OverloadError('Unable to access FTP parsing methods')
    except ValueError as e:
        module_logger.error(
            'FTP settings JSON file malformed. Error: {}'.format(e))
        raise OverloadError('Unable to access FTP parsing methods')

    # determine FTP server response parsing
    try:
        m = fs[host]
    except KeyError:
        module_logger.error(
            'Accessing parsing info for unidentified FTP host: {}'.format(
                host))
        raise OverloadError('Unidentified FTP host.')
    if m:
        dirs = []
        files = []
        try:
            for line in ls:
                if line[m['dir_mark'][1]:m['dir_mark'][2] + 1] == \
                        m['dir_mark'][0]:
                    d = line[m['dir_handle']:].strip()
                    dirs.append(d)
                elif line[m['file_mark'][1]:m['file_mark'][2] + 1] == \
                        m['file_mark'][0]:
                    f = line[m['file_handle']:].strip()
                    s = line[m['file_size_pos'][0]:m['file_size_pos'][1] +
                             1].strip()

                    # timestamp
                    t = line[m['file_time_pos'][0]:m['file_time_pos'][1] +
                             1].strip()
                    patterns = m['time_patterns']
                    for p in patterns:
                        try:
                            t = convert2date_obj(t, p)
                            break
                        except ValueError:
                            pass
                    files.append((f, s, t))
            return (dirs, files)
        except KeyError as e:
            module_logger.error(
                'FTP parsing settings for {} are malformed. Error: {}'.format(
                    host, e))
            raise OverloadError('FTP parsing settings error.')
        except IndexError as e:
            module_logger.error(
                'FTP parsing settings for {} are incorrect. Error: {}'.format(
                    host, e))
            raise OverloadError('FTP parsing settings error.')
    else:
        module_logger.error(
            'Unable to parse FTP response to LIST cmd on host {}'.format(host))
        raise OverloadError('Unable to parse FTP response.')
Example #23
def launch_process(
    system,
    library,
    target,
    id_type,
    action,
    source_fh,
    dst_fh,
    progbar,
    hit_counter,
    nohit_counter,
):
    """
    manages retrieval of bibs or bibs numbers based on
    args:
        system: str, NYPL or BPL
        library: str, branches or research
        target: dict, keys = name, method
        id_type: str, one of ISBN, ISSN, LCCN, OCLC number, or UPC
        output: str, MARC or bib #
        dst_fh: str, path to destination file
        progbar: tkinter widget

    """
    # temp report
    timestamp_start = datetime.now()
    try:
        os.remove(GETBIB_REP)
    except WindowsError:
        pass
    header = None

    # calc progbar maximum and dedup
    ids = []
    dups = set()
    with open(source_fh) as source:
        reader = csv.reader(source)
        # skip header
        next(reader)
        c = 0
        d = 0
        for row in reader:
            rid = row[0].strip()
            if rid:
                c += 1
                if rid in ids:
                    d += 1
                    dups.add(rid)
                else:
                    ids.append(rid)
        progbar["maximum"] = c

    # determine correct matchpoint based on id_type
    if id_type == "ISBN":
        matchpoint = "020"
    elif id_type == "ISSN":
        matchpoint = "022"
    elif id_type == "UPC":
        matchpoint = "024"
    elif id_type == "OCLC #":
        matchpoint = "001"
    else:
        raise OverloadError("Query by {} not yet implemented".format(id_type))

    # determine destination API
    if target["method"] == "Platform API":
        module_logger.debug("Creating Platform API session.")
        try:
            session = open_platform_session(target["name"])
        except OverloadError:
            raise
    elif target["method"] == "Z3950":
        module_logger.debug("retrieving Z3950 settings for {}".format(
            target["name"]))
        user_data = shelve.open(USER_DATA)
        target = user_data["Z3950s"][target["name"]]
        user_data.close()

    for i in ids:
        # like vendor meta in PVR
        meta_in = BibOrderMeta(system=system, dstLibrary=library)
        if id_type == "ISBN":
            meta_in.t020 = [i]
        elif id_type == "ISSN":
            meta_in.t022 = [i]
        elif id_type == "UPC":
            meta_in.t024 = [i]
        elif id_type == "OCLC #":
            meta_in.t001 = i

        module_logger.debug(str(meta_in))

        # query NYPL Platform
        if target["method"] == "Platform API":
            try:
                result = platform_queries_manager(target["method"], session,
                                                  meta_in, matchpoint)
            except APITokenExpiredError:
                module_logger.info(
                    "Platform token expired. "
                    "Requesting new one and opening new session.")
                session = open_platform_session(target["name"])
                result = platform_queries_manager(target["method"], session,
                                                  meta_in, matchpoint)

            meta_out = []
            if result[0] == "hit":
                hit_counter.set(hit_counter.get() + 1)
                meta_out = platform2meta(result[1])
            elif result[0] == "nohit":
                nohit_counter.set(nohit_counter.get() + 1)

        elif target["method"] == "Z3950":
            meta_out = []
            status, bibs = z3950_query_manager(target, meta_in, matchpoint)
            if status == "hit":
                hit_counter.set(hit_counter.get() + 1)
                meta_out = bibs2meta(bibs)
            elif status == "nohit":
                nohit_counter.set(nohit_counter.get() + 1)

        if system == "NYPL":
            analysis = PVR_NYPLReport("cat", meta_in, meta_out)
        elif system == "BPL":
            analysis = PVR_BPLReport("cat", meta_in, meta_out)
        module_logger.debug(str(analysis))

        if not header:
            header = list(analysis.to_dict().keys())
            header.insert(0, "pos")
            save2csv(GETBIB_REP, header)
        if analysis.target_sierraId:
            analysis.target_sierraId = "b{}a".format(analysis.target_sierraId)
        row = list(analysis.to_dict().values())
        row.insert(0, progbar["value"])
        save2csv(GETBIB_REP, row)

        progbar["value"] += 1
        progbar.update()

    # record data about the batch
    timestamp_end = datetime.now()
    user_data = shelve.open(USER_DATA)
    user_data["getbib_batch"] = {
        "timestamp": timestamp_start,
        "time_elapsed": timestamp_end - timestamp_start,
        "total_queries": c,
        "target": target,
        "hits": hit_counter.get(),
        "misses": nohit_counter.get(),
        "dup_count": d,
        "dups": dups,
    }
    user_data.close()
    progbar["value"] = progbar["maximum"]
Example #24
def validate_files(system, agent, files, marcval=False, locval=False):
    """
    Runs default, MARCEdit, and local specs validation on vendor files.
    args:
        system: str, library system
        agent: str, processing agent
        files: list, paths of MARC files to be validated
        marcval: bool, run MARCEdit MARC syntax validation
        locval: bool, run local specification validation
    returns:
        valid_files: bool, True if all requested validations passed
    """
    valid_files = True

    # mandatory, default validation

    try:
        dup_barcodes = default.barcode_duplicates(files, system)
        if dup_barcodes != {}:
            valid_files = False
        default.save_report(dup_barcodes, DVAL_REP)
    except OverloadError as e:
        module_logger.error('Unable to create default validation report. '
                            'Error: {}'.format(e))
        raise OverloadError(e)

    # MARCEdit MARC syntax validation
    if marcval:
        module_logger.debug('Running MARCEdit validation.')
        # make sure MARCEdit is installed on the machine
        val_engine = marcedit.get_engine()
        if val_engine is None:
            # display error message
            raise OverloadError(
                'Failed to locate cmarcedit.exe or marcrules.txt\n'
                'files of MARCEdit program. Unable to complete\n'
                'MARC validation. Please uncheck the box if you\n'
                'still want to proceed.')
        else:
            cme = val_engine[0]
            rules = val_engine[1]
            report_q = MVAL_REP
            overwrite = True
            for file in files:
                file_q = file
                success_process = marcedit.validate(cme, file_q, report_q,
                                                    rules, overwrite)
                overwrite = False
                if success_process:
                    result = marcedit.validation_check(MVAL_REP)
                    if not result[0]:
                        valid_files = False
                else:
                    valid_files = False
                    raise OverloadError(
                        'Encountered a problem with the file:\n'
                        '{}.\nNot able to validate in MARCEdit'.format(file))

    # delete previous local spec report
    if not remove_files([LSPEC_REP]):
        module_logger.error(
            'Unable to delete previous local spec validation report.')
        raise OverloadError(
            'Unable to remove previous local spec validation report.')

    # local specification validation
    if locval:
        module_logger.debug('Local specs validation launch.')

        # define local specs rules for each system, agent, and vendor
        try:
            rules = './rules/vendor_specs.xml'
            specs = local_specs.local_specs(system, agent, rules)
        except AttributeError as e:
            module_logger.error('Unable to parse local specs rules. '
                                'Error: {}'.format(e))
            raise OverloadError(e)

        # run the local specs validation
        locval_passed, report = local_specs.local_specs_validation(
            system, files, specs)
        if not locval_passed:
            valid_files = False

        # save the report to a file so the last batch is always remembered.
        try:
            with open(LSPEC_REP, 'w') as file:
                file.write(report)
        except IOError as e:
            module_logger.error(
                'Encountered error while creating local specs validation'
                ' report. Error: {}'.format(e))
            raise OverloadError(
                'Unable to create local spec validation\nreport.')

    return valid_files