Example #1
def entries_from_file(file, max_records, start_index, proxyinfo, uisettings):
    # The max_records, start_index and proxyinfo parameters are unused here;
    # they keep this signature parallel to entries_from_search() below.
    xmlfile = open(file, 'r')
    if __debug__: log('parsing XML file {}', file)
    try:
        xmlcontent = ElementTree.parse(xmlfile)
        for data in _extracted_data(xmlcontent):
            yield data
    except KeyboardInterrupt:
        msg('Stopped', 'warn', uisettings.colorize)
        yield None
    except Exception as err:
        msg('Error: {}'.format(err), 'error', uisettings.colorize)
        yield None
    finally:
        xmlfile.close()
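
A minimal sketch of how this generator might be consumed; the None value it
yields on interruption or error acts as a sentinel, mirroring the "if not
item: break" checks in the writers below. The caller name process_file is
hypothetical:

def process_file(file, uisettings):
    # Stop at the sentinel None that entries_from_file() yields after an
    # interruption or error; max_records and proxyinfo are unused there.
    for index, entry in enumerate(entries_from_file(file, None, 1, None, uisettings), 1):
        if not entry:
            break
        print_record(index, entry, uisettings.colorize)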
Example #2
def write_csv(filename, tind_results, include_unchanged, all):
    file = open(filename, 'w', newline='')

    # Write the header row.
    csvwriter = csv.writer(file, delimiter=',')
    header = ['TIND record id']
    for i in range(1, _NUM_URLS + 1):
        header += ['Original URL {}'.format(i), 'Final URL {}'.format(i)]
    csvwriter.writerow(header)
    try:
        for item in tind_results:
            if not item:
                if __debug__: log('no data -- stopping')
                break
            if not item.url_data and not all:
                if __debug__:
                    log('no URLs for {} -- not saving'.format(item.id))
                continue
            if (not contains_changed_urls(item.url_data)
                    and not (include_unchanged or all)):
                if __debug__:
                    log('URLs unchanged for {} -- skipping'.format(item.id))
                continue
            row = [item.id]
            if __debug__: log('writing row for {}'.format(item.id))
            for url_data in item.url_data:
                row.append(url_data.original)
                if url_data.error:
                    row.append('(error: {})'.format(url_data.error))
                else:
                    row.append(url_data.final or '')
            if item.url_data or all:
                csvwriter.writerow(row)
                file.flush()
    except KeyboardInterrupt:
        msg('Interrupted -- closing "{}" and exiting'.format(filename))
    finally:
        file.close()
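
The contains_changed_urls() helper used above is not shown in these examples.
A plausible implementation, consistent with how print_record() below compares
original and final URLs, might look like this (an assumption, not the actual
source):

def contains_changed_urls(url_data_list):
    # Assumed behavior: a record is worth writing if any of its URLs
    # dereferenced to something different from the original.
    return any(item.original != item.final for item in url_data_list)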
Example #3
def print_record(current_index, record, colorize):
    if not record.url_data:
        msg('No URLs for {}'.format(record.id), 'warn', colorize)
        return
    # Accumulate one line per URL, then print a single message with the
    # continuation lines aligned under the record id.
    text = []
    for item in record.url_data:
        if item.error:
            if colorize:
                text += ['{} error: {}'.format(color(item.original, 'error', colorize),
                                               color(item.error, 'error', colorize))]
            else:
                # If not using colorization, go easy on the use of the
                # 'error' code because the plain-text equivalent is loud.
                text += ['{}: {}'.format(color(item.original, 'error', colorize),
                                         color(item.error, 'info', colorize))]
        elif item.original != item.final:
            text += ['{} => {}'.format(color(item.original, 'info', colorize),
                                       color(item.final, 'cyan', colorize))]
        else:
            text += ['{} {}'.format(color(item.original, 'info', colorize),
                                    color('[unchanged]', 'dark', colorize))]
    msg('({:6}) {}: {}'.format(current_index, record.id,
                               ('\n          ' + ' '*len(record.id)).join(text)))
Example #4
def entries_from_search(search, max_records, start_index, proxyinfo, uisettings):
    # Get results in batches of a certain number of records.
    if max_records and max_records < _FETCH_COUNT:
        search = substituted(search, '&rg=', '&rg=' + str(max_records))
    else:
        search = substituted(search, '&rg=', '&rg=' + str(_FETCH_COUNT))
    # Substitute the output format to be MARCXML.
    search = substituted(search, '&of=', '&of=xm')
    # Remove any 'ot' field because it screws up results.
    search = substituted(search, '&ot=', '')
    # Set starting and stopping points.
    current = start_index
    stop = (start_index + max_records) if max_records else sys.maxsize
    if __debug__: log('query string: {}', search)
    if __debug__: log('getting records starting at {}', start_index)
    if __debug__: log('will stop at {} records', stop)
    # The tind.io output doesn't include the number of records available.  So,
    # when iterating over all results, we must do something ourselves to avoid
    # fetching the last page over and over.  We watch for entries we've seen.
    seen = set()
    # Sometimes the server stops returning values.  Unclear why, but when it
    # happens we may as well stop.  We track it using this variable:
    consecutive_nulls = 0
    while 0 < current < stop and consecutive_nulls < _MAX_NULLS:
        try:
            marcxml = tind_records(search, current, proxyinfo)
            if not marcxml:
                if __debug__: log('no records received')
                current = -1
                consecutive_nulls += 1
                break
            if __debug__: log('looping over {} TIND records', len(marcxml))
            for data in _extracted_data(marcxml, proxyinfo):
                if data.id in seen:
                    stop = 0
                else:
                    seen.add(data.id)
                if not data.url_data:
                    consecutive_nulls += 1
                else:
                    consecutive_nulls = 0
                if not uisettings.quiet:
                    print_record(current, data, uisettings.colorize)
                yield data
                if current >= stop:
                    break
                current += 1
                if proxyinfo.reset:
                    # Don't keep resetting the credentials.
                    proxyinfo.reset = False
        except KeyboardInterrupt:
            msg('Stopped', 'warn', uisettings.colorize)
            current = -1
        except Exception as err:
            msg('Error: {}'.format(err), 'error', uisettings.colorize)
            current = -1
        sleep(0.5)                      # Be nice to the server.
    if current >= stop and consecutive_nulls < _MAX_NULLS:
        if __debug__: log('stopping point reached')
        if not uisettings.quiet:
            msg('Processed {} entries'.format(len(seen)), 'info', uisettings.colorize)
    elif consecutive_nulls >= _MAX_NULLS:
        if not uisettings.quiet:
            msg('Too many consecutive null responses -- something is wrong',
                'error', uisettings.colorize)
    yield None
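
The substituted() helper used above is not shown. A sketch consistent with
its call sites, which replace or delete one query parameter at a time (a
hypothetical implementation, not the actual source):

import re

def substituted(query, param_prefix, replacement):
    # Replace '<param_prefix><value>', where the value runs to the next '&'
    # or the end of the string, with the replacement text.  An empty
    # replacement deletes the parameter, as in substituted(search, '&ot=', '').
    pattern = re.escape(param_prefix) + r'[^&]*'
    return re.sub(pattern, lambda match: replacement, query)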
Example #5
def write_xls(filename, tind_results, include_unchanged, all):
    # Create some things we reuse below.
    bold_style = Font(bold=True, underline="single")
    hyperlink_style = Font(underline='single', color='0563C1')
    error_style = Font(color='aa2222')

    # Create a sheet in a new workbook and give it a distinctive style.
    wb = openpyxl.Workbook(write_only=True)
    sheet = wb.create_sheet()
    sheet.title = 'Results'
    sheet.sheet_properties.tabColor = 'f7ba0b'

    # Set the widths of the different columns to something more convenient.
    column = get_column_letter(1)
    sheet.column_dimensions[column].width = 15
    for idx in range(2, _NUM_URLS * 2 + 2):
        column = get_column_letter(idx)
        sheet.column_dimensions[column].width = 80

    # Set the headings and format them a little bit.
    cell1 = WriteOnlyCell(sheet, value='TIND Identifier')
    cell1.font = bold_style
    row = [cell1]
    for i in range(1, _NUM_URLS + 1):
        cell = WriteOnlyCell(sheet, value='Original URL #{}'.format(i))
        cell.font = bold_style
        row.append(cell)
        cell = WriteOnlyCell(sheet, value='Final URL #{}'.format(i))
        cell.font = bold_style
        row.append(cell)

    # Write the header row.
    sheet.append(row)

    # Now create the data rows.
    try:
        # Data rows are enumerated from 2 because row 1 holds the header.
        for row_number, item in enumerate(tind_results, 2):
            if not item:
                if __debug__: log('no data -- stopping')
                break
            if not item.url_data and not all:
                if __debug__:
                    log('no URLs for {} -- not saving'.format(item.id))
                continue
            if (not contains_changed_urls(item.url_data)
                    and not (include_unchanged or all)):
                if __debug__:
                    log('URLs unchanged for {} -- skipping'.format(item.id))
                continue
            if __debug__: log('writing row {}'.format(row_number))
            cell = WriteOnlyCell(sheet, value=hyperlink(tind_entry_url(item.id), item.id))
            cell.font = hyperlink_style
            row = [cell]
            for url_data in item.url_data:
                cell = WriteOnlyCell(sheet, value=hyperlink(url_data.original))
                cell.font = hyperlink_style
                row.append(cell)
                if url_data.error:
                    cell = WriteOnlyCell(sheet,
                                         value='(error: {})'.format(
                                             url_data.error))
                    cell.font = error_style
                else:
                    cell = WriteOnlyCell(sheet, value=hyperlink(url_data.final or ''))
                    cell.font = hyperlink_style
                row.append(cell)
            sheet.append(row)
    except KeyboardInterrupt:
        msg('Interrupted -- closing "{}" and exiting'.format(filename))
    finally:
        wb.save(filename=filename)
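
The hyperlink() helper is likewise not shown.  Because write_xls() uses
openpyxl's write-only mode and stores the helper's return value as a cell
value, one plausible implementation returns an Excel HYPERLINK() formula
(this is an assumption about the helper, not the actual source):

def hyperlink(url, text=None):
    # Assumed helper: Excel renders a HYPERLINK() formula as a clickable
    # link, which fits the write-only cells used above.
    return '=HYPERLINK("{}", "{}")'.format(url, text or url)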
Example #6
def main(file='F',
         output='R',
         all=False,
         unchanged=False,
         start_at='N',
         total='M',
         user='U',
         pswd='P',
         quiet=False,
         no_color=False,
         no_keyring=False,
         reset=False,
         version=False,
         *search):
    '''Look for caltech.tind.io records containing URLs and return updated URLs.

If not given an explicit search query, it will perform a default search that
looks for records containing URLs in MARC field 856.  If given a search query
on the command line, the string should be a complete search URL as would be
typed into a web browser address bar (or more practically, copied from the
browser address bar after performing some exploratory searches in
caltech.tind.io).  If given a file using the -f option (/f on Windows), the
file should contain MARC XML content.
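
For example, to read records from a local MARC XML file (the file name here
is only illustrative):

   turf -f saved-records.xml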

It is best to quote the search string, using double quotes on Windows and
single quotes on Linux/Unix, to avoid terminal shells interpreting special
characters such as question marks in the search string.  Example (for Windows):

   turf "https://caltech.tind.io/search?ln=en&p=856%3A%27ebrary%27"

By default, this program only writes out entries that have URLs in MARC field
856, and then only those whose URLs are found to dereference to a different
URL after following it.  (That is, by default, it skips writing entries whose
URLs do not change after dereferencing.)  If given the -u flag (/u on
Windows), it will write out entries with URLs even if the URLs are unchanged
after dereferencing.  If given the -a flag (/a on Windows), it will write out
all TIND entries retrieved, even those that have no URLs.
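
For example, to also write out entries whose URLs are unchanged after
dereferencing (the search URL is abbreviated here):

   turf -u "https://caltech.tind.io/search?..."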

If given the -t option (/t on Windows), it will only fetch and process a
total of that many results instead of all results.  If given the -s (/s on
Windows) option, it will start at that entry instead of starting at number 1;
this is useful if searches are being done in batches or a previous search is
interrupted and you don't want to restart from 1.
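
For example, to fetch at most 200 records beginning at entry 101 (the
numbers are only illustrative):

   turf -t 200 -s 101 "https://caltech.tind.io/search?..."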

If given an output file using the -o option (/o on Windows), the results will
be written to that file.  The format of the file will be deduced from the file
name extension (.csv or .xlsx).  In the absence of a file name extension, it
will default to XLSX format.  If not given an output file, the results will
only be printed to the terminal.
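
For example, to save the results in CSV format (the file name is
illustrative):

   turf -o results.csv "https://caltech.tind.io/search?..."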

If the URLs to be dereferenced involve a proxy server (such as EZproxy, a
common type of proxy used by academic institutions), it will be necessary to
supply login credentials for the proxy component.  By default, Turf uses the
operating system's keyring/keychain functionality to remember the user name
and password.  If the information does not exist from a previous run, Turf
will query the user interactively for the user name and password, and (unless
the -X or /X argument is given) store them in the user's keyring/keychain so
that it does not have to ask again in the future.  It is also possible to
supply the information directly on the command line using the -u and -p
options (or /u and /p on Windows), but this is discouraged because it is
insecure on multiuser computer systems.

To reset the user name and password (e.g., if a mistake was made the last time
and the wrong credentials were stored in the keyring/keychain system), add the
-R (or /R on Windows) command-line argument to a command.  The next time
Urlup needs to use a proxy login, it will query for the user name and password
again even if an entry already exists in the keyring or keychain.

This program will print information to the terminal as it processes URLs,
unless the -q option (or /q on Windows) is given to make it run quietly.
'''

    # Our defaults are to do things like color the output, which means the
    # command line flags make more sense as negated values (e.g., "nocolor").
    # Dealing with negated variables is confusing, so turn them around here.
    colorize = 'termcolor' in sys.modules and not no_color
    use_keyring = not no_keyring

    # We use default values that provide more intuitive help text printed by
    # plac.  Rewrite the values to things we actually use.
    if file == 'F' and not path.exists('F'):
        file = None
    if output == 'R':
        output = None
    if start_at and start_at == 'N':
        start_at = 1
    if total and total == 'M':
        total = None
    if user == 'U':
        user = None
    if pswd == 'P':
        pswd = None

    # Process arguments.
    if version:
        print_version()
        sys.exit()
    if file and search:
        raise SystemExit(
            color('Cannot use a file and search string simultaneously',
                  'error', colorize))
    if file and not file.endswith('.xml'):
        raise SystemExit(
            color('"{}" does not appear to be an XML file'.format(file),
                  'error', colorize))
    if search:
        if any(item.startswith(('-', '/')) for item in search):
            raise SystemExit(
                color('Command not recognized: {}'.format(search), 'error',
                      colorize))
        else:
            search = search[0]  # Compensate for how plac provides arg value.
    if not search:
        search = _DEFAULT_SEARCH
        msg('No search term provided -- will use default:', 'info', colorize)
        msg(search, 'info', colorize)
    if total and not quiet:
        msg('Will stop after getting {} records'.format(total), 'info',
            colorize)
    if total:
        total = int(total)
    if not output and not quiet:
        msg("No output file specified; results won't be saved.", 'warn',
            colorize)
    elif not quiet:
        msg('Output will be written to {}'.format(output), 'info', colorize)
        if all:
            msg('Saving all results, including those without URLs', 'info',
                colorize)
        else:
            msg('Saving only relevant results', 'info', colorize)
    if output:
        name, extension = path.splitext(output)
        if extension and extension.lower() not in ['.csv', '.xlsx']:
            raise SystemExit(
                color('"{}" has an unrecognized file extension'.format(output),
                      'error', colorize))
        elif not extension:
            msg('"{}" has no name extension; defaulting to xlsx'.format(output),
                'warn', colorize)
    start_at = int(start_at)

    # General sanity checks.
    if not network_available():
        raise SystemExit(color('No network', 'error', colorize))

    # Let's do this thing.
    uisettings = UIsettings(colorize=colorize, quiet=quiet)
    proxyinfo = ProxyInfo(user, pswd, use_keyring, reset)
    results = []
    try:
        if file:
            input = None
            if path.exists(file):
                input = file
            elif path.exists(path.join(os.getcwd(), file)):
                input = path.join(os.getcwd(), file)
            else:
                raise SystemExit(
                    color('Cannot find file "{}"'.format(file), 'error',
                          colorize))
            if not quiet:
                msg('Reading MARC XML from {}'.format(input), 'info', colorize)
            results = entries_from_file(input, total, start_at, proxyinfo,
                                        uisettings)
        else:
            results = entries_from_search(search, total, start_at, proxyinfo,
                                          uisettings)
    except Exception as e:
        msg('Exception encountered: {}'.format(e), 'error', colorize)
    finally:
        if not results:
            msg('No results returned.', 'warn', colorize)
        elif output:
            write_results(output, results, unchanged, all)
        else:
            print_results(results)
        if not quiet:
            msg('Done.', 'info', colorize)
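
The docstring above mentions plac, which maps main()'s keyword parameters to
command-line options.  A plausible entry point, assuming the program follows
the usual plac pattern (hypothetical, not shown in the original):

if __name__ == '__main__':
    import plac
    plac.call(main)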