예제 #1
0
    def __init__(self, ingestRecord, ingestDateTime=None):
        """Create the empty SFR data objects for one source record.

        Args:
            ingestRecord: The source data for this record; stored unchanged
                on ``self.ingest``.
            ingestDateTime: Optional modification time. A ``datetime`` (or
                an instance of a ``datetime`` subclass) is formatted as
                ``YYYY-MM-DD HH:MM:SS``; any other non-None value is stored
                exactly as provided. When None, the current time from
                ``datetime.now()`` is used instead.
        """
        # Initialize with empty SFR data objects
        # self.ingest contains the source data
        self.work = WorkRecord()
        self.ingest = ingestRecord
        self.instance = InstanceRecord()
        self.item = Format()
        self.rights = Rights()
        self.modified = ingestDateTime
        logger.debug('Initializing empty HathiRecord object')

        # We need a fallback modified date if none is provided
        if self.modified is None:
            self.modified = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            logger.debug(
                'Assigning generated timestamp of {} to new record'.format(
                    self.modified))
        elif isinstance(self.modified, datetime):
            # isinstance (rather than an exact type comparison) so that
            # datetime subclasses are normalized to the same string format
            self.modified = self.modified.strftime('%Y-%m-%d %H:%M:%S')
예제 #2
0
def transformMARC(record, marcRels):
    """Transform a DOAB MARC record into a SFR work object.

    Args:
        record: An indexable triple of (DOAB identifier, issued date,
            parsed MARC record). The MARC record is read by field tag
            (e.g. ``marcRecord['546']``).
        marcRels: Passed through unchanged to ``extractAgentValue`` for every
            agent field.

    Returns:
        A ``(work, doabID)`` tuple, where ``work`` is the populated
        WorkRecord holding a single instance, which in turn holds a single
        item.
    """
    doabID = record[0]
    dateIssued = record[1]
    marcRecord = record[2]
    logger.info('Transforming record {} into a SFR object'.format(doabID))

    work = WorkRecord()
    instance = InstanceRecord()
    item = Format(source='doab', contentType='ebook')

    # Add issued date to work record
    work.addClassItem(
        'dates', Date, **{
            'display_date': dateIssued,
            'date_range': dateIssued,
            'date_type': 'issued'
        })

    # All DOAB records have the same CreativeCommons license, assign this
    # to Instance/Item records
    rights = Rights(
        source='doab',
        license='https://creativecommons.org/licenses/by-nc-nd/4.0/',
        statement=
        'Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International'
    )
    instance.rights.append(rights)
    item.rights.append(rights)

    # A single DOAB identifier can be assigned to the work/instance/item records
    doabIdentifier = Identifier(type='doab', identifier=doabID, weight=1)
    work.identifiers.append(doabIdentifier)
    instance.identifiers.append(doabIdentifier)
    item.identifiers.append(doabIdentifier)

    # Code Fields (Identifiers)
    logger.debug('Parsing 0X0-0XX Fields')
    controlData = [('010', 'identifiers', 'a', 'lccn'),
                   ('020', 'identifiers', 'a', 'isbn'),
                   ('022', 'identifiers', 'a', 'issn'),
                   ('050', 'identifiers', 'a', 'lcc'),
                   ('082', 'identifiers', 'a', 'ddc'),
                   ('010', 'identifiers', 'z', 'lccn'),
                   ('020', 'identifiers', 'z', 'isbn'),
                   ('022', 'identifiers', 'z', 'issn'),
                   ('050', 'identifiers', 'z', 'lcc'),
                   ('082', 'identifiers', 'z', 'ddc')]
    for field in controlData:
        extractSubfieldValue(marcRecord, work, field)
        extractSubfieldValue(marcRecord, instance, field)

    # Author/Creator Fields (main entries 1XX and added entries 7XX)
    logger.debug('Parsing 100, 110 & 111 Fields')
    agentData = ['100', '110', '111', '700', '710', '711']
    for agentField in agentData:
        extractAgentValue(marcRecord, work, agentField, marcRels)

    # Title Fields
    logger.debug('Parsing 21X-24X Fields')
    titleData = [('210', 'alt_titles', 'a'), ('222', 'alt_titles', 'a'),
                 ('242', 'alt_titles', 'a'), ('246', 'alt_titles', 'a'),
                 ('247', 'alt_titles', 'a'), ('245', 'title', 'a'),
                 ('245', 'sub_title', 'b')]
    for field in titleData:
        extractSubfieldValue(marcRecord, work, field)
        extractSubfieldValue(marcRecord, instance, field)

    # Edition Fields
    logger.debug('Parsing Edition (250 & 260) Fields')
    editionData = [('250', 'edition_statement', 'a'),
                   ('250', 'edition_statement', 'b'),
                   ('260', 'pub_place', 'a'), ('260', 'pub_date', 'c'),
                   ('260', 'agents', 'b', 'publisher'),
                   ('260', 'agents', 'f', 'manufacturer'),
                   ('264', 'copyright_date', 'c')]
    for field in editionData:
        extractSubfieldValue(marcRecord, instance, field)

    # Physical Details
    # TODO Load fields into items/measurements?
    logger.debug('Parsing Extent (300) Field')
    extentData = [('300', 'extent', 'a'), ('300', 'extent', 'b'),
                  ('300', 'extent', 'c'), ('300', 'extent', 'e'),
                  ('300', 'extent', 'f')]
    for field in extentData:
        extractSubfieldValue(marcRecord, instance, field)

    # Series Details
    logger.debug('Parsing Series (490) Field')
    seriesData = [('490', 'series', 'a'), ('490', 'series_position', 'v')]
    for field in seriesData:
        extractSubfieldValue(marcRecord, work, field)

    # Notes/Description details
    # TODO What fields should we bring in?
    logger.debug('Parsing TOC (505) Field')
    tocData = [('505', 'table_of_contents', 'a'), ('520', 'summary', 'a')]
    for field in tocData:
        extractSubfieldValue(marcRecord, instance, field)

    # Language Fields
    # Only the first 546 field is inspected; each $a value may name several
    # languages separated by "/" or "|"
    if len(marcRecord['546']) > 0:
        for lang in marcRecord['546'][0].subfield('a'):
            langs = re.split(r'/|\|', lang.value)
            for language in langs:
                logger.debug(
                    'Adding language {} to work and instance'.format(language))
                langName = language.strip()
                try:
                    langObj = pycountry.languages.get(name=langName)
                except LookupError:
                    # NOTE(review): some pycountry releases are understood to
                    # raise KeyError for unknown names instead of returning
                    # None; treat both outcomes as "not found"
                    langObj = None
                if langObj is None or langObj.alpha_3 == 'und':
                    logger.warning(
                        'Unable to parse language {}'.format(language))
                    continue
                # pycountry only sets alpha_2 on languages that have a
                # two-letter code, so read it defensively. The stored name is
                # the same stripped value that was used for the lookup.
                sfrLang = Language(language=langName,
                                   iso_2=getattr(langObj, 'alpha_2', None),
                                   iso_3=langObj.alpha_3)
                work.language.append(sfrLang)
                instance.language.append(sfrLang)

    # Subject Details
    logger.debug('Parsing 6XX Subject Fields')
    subjectData = ['600', '610', '648', '650', '651', '655', '656', '657']
    for subjectType in subjectData:
        extractSubjects(marcRecord, work, subjectType)

    # Electronic Holding Details
    logger.debug('Parsing 856 (Electronic Holding) Field')
    extractHoldingsLinks(marcRecord['856'], instance, item)

    # TODO Load data for these fields
    # 76X-78X
    # 80X-83X
    instance.formats.append(item)
    work.instances.append(instance)
    return work, doabID
예제 #3
0
class HathiRecord():
    """Class for constructing HathiTrust-based records in the SFR data model.
    This largely serves as a wrapper for classes imported from the SFR model,
    and includes functions that can build these up. It also contains several
    class-level lookup tables for codes/values provided in the Hathi CSV files.
    """

    # These codes are supplied by Hathi as the determination of an item's
    # rights status. Each code maps to the human-readable reason that is
    # stored on the generated rights object.
    rightsReasons = {
        'bib': 'bibliographically-derived by automatic processes',
        'ncn': 'no printed copyright notice',
        'con': 'contractual agreement with copyright holder on file',
        'ddd': 'due diligence documentation on file',
        'man': 'manual access control override; see note for details',
        'pvt': 'private personal information visible',
        'ren': 'copyright renewal research was conducted',
        'nfi': 'needs further investigation (copyright research partially complete)',
        'cdpp': 'title page or verso contain copyright date and/or place of publication information not in bib record',
        'ipma': 'in-print and market availability research was conducted',
        'unp': 'unpublished work',
        'gfv': 'Google viewability set at VIEW_FULL',
        'crms': 'derived from multiple reviews in the Copyright Review Management System',
        'add': 'author death date research was conducted or notification was received from authoritative source',
        'exp': 'expiration of copyright term for non-US work with corporate author',
        'del': 'deleted from the repository; see note for details',
        'gatt': 'non-US public domain work restored to in-copyright in the US by GATT',
        'supp': 'suppressed from view; see note for details'
    }

    # Decodes rights statements into full licenses (CreativeCommons links where
    # possible), and human-readable statements. For the Creative Commons codes
    # the license URL path and the statement must both agree with the code
    # itself (e.g. 'cc-by-nc-3.0' -> .../licenses/by-nc/3.0/).
    rightsValues = {
        'pd': {
            'license': 'public_domain',
            'statement': 'Public Domain'
        },
        'ic': {
            'license': 'in_copyright',
            'statement': 'In Copyright'
        },
        'op': {
            'license': 'in_copyright (out_of_print)',
            'statement': 'Out of Print (implied to be in copyright)'
        },
        'orph': {
            'license': 'in_copyright (orphaned)',
            'statement': 'Copyright Orphaned (implied to be in copyright)'
        },
        'und': {
            'license': 'undetermined',
            'statement': 'Status Undetermined'
        },
        'ic-world': {
            'license': 'in_copyright (viewable)',
            'statement': 'In Copyright, permitted to be world viewable'
        },
        'nobody': {
            'license': 'in_copyright (blocked)',
            'statement': 'Blocked for all users'
        },
        'pdus': {
            'license': 'public_domain (us_only)',
            'statement': 'Public Domain when viewed in the US'
        },
        'cc-by-3.0': {
            'license': 'https://creativecommons.org/licenses/by/3.0/',
            'statement': 'Creative Commons Attribution License, 3.0 Unported'
        },
        'cc-by-nc-3.0': {
            'license': 'https://creativecommons.org/licenses/by-nc/3.0/',
            'statement': 'Creative Commons Attribution, Non-Commercial, 3.0 Unported'
        },
        'cc-by-nc-sa-3.0': {
            'license': 'https://creativecommons.org/licenses/by-nc-sa/3.0/',
            'statement': 'Creative Commons Attribution, Non-Commercial, Share Alike License, 3.0 Unported'
        },
        'cc-by-nd-3.0': {
            'license': 'https://creativecommons.org/licenses/by-nd/3.0/',
            'statement': 'Creative Commons Attribution, No Derivatives License, 3.0 Unported'
        },
        'cc-by-nc-nd-3.0': {
            'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/',
            'statement': 'Creative Commons Attribution, Non-Commercial, No Derivatives License, 3.0 Unported'
        },
        'cc-by-sa-3.0': {
            'license': 'https://creativecommons.org/licenses/by-sa/3.0/',
            'statement': 'Creative Commons Attribution, Share Alike License, 3.0 Unported'
        },
        'orphcand': {
            'license': 'in_copyright (90-day hold)',
            'statement': 'Orphan Candidate - in 90-day holding period'
        },
        'cc-zero': {
            'license': 'https://creativecommons.org/publicdomain/zero/1.0/',
            'statement': 'Creative Commons Universal Public Domain'
        },
        'und-world': {
            'license': 'undetermined',
            'statement': 'Copyright Status undetermined, world viewable'
        },
        'icus': {
            'license': 'in_copyright (in US)',
            'statement': 'In Copyright in the US'
        },
        'cc-by-4.0': {
            'license': 'https://creativecommons.org/licenses/by/4.0/',
            'statement': 'Creative Commons Attribution 4.0 International License'
        },
        'cc-by-nd-4.0': {
            'license': 'https://creativecommons.org/licenses/by-nd/4.0/',
            'statement': 'Creative Commons Attribution, No Derivatives 4.0 International License'
        },
        'cc-by-nc-nd-4.0': {
            'license': 'https://creativecommons.org/licenses/by-nc-nd/4.0/',
            'statement': 'Creative Commons Attribution, Non-Commercial, No Derivatives 4.0 International License'
        },
        'cc-by-nc-4.0': {
            'license': 'https://creativecommons.org/licenses/by-nc/4.0/',
            'statement': 'Creative Commons Attribution, Non-Commercial 4.0 International License'
        },
        'cc-by-nc-sa-4.0': {
            'license': 'https://creativecommons.org/licenses/by-nc-sa/4.0/',
            'statement': 'Creative Commons Attribution, Non-Commercial, Share Alike 4.0 International License'
        },
        'cc-by-sa-4.0': {
            'license': 'https://creativecommons.org/licenses/by-sa/4.0/',
            'statement': 'Creative Commons Attribution, Share Alike 4.0 International License'
        },
        'pd-pvt': {
            'license': 'public_domain (privacy_limited)',
            'statement': 'Public Domain access limited for privacy concerns'
        },
        'supp': {
            'license': 'suppressed',
            'statement': 'Suppressed from view'
        }
    }

    # List of institution codes for organizations that have contributed
    # materials to HathiTrust
    sourceCodes = {
        'allegheny': 'Allegheny College',
        'amherst': 'Amherst College',
        'archive': 'Internet Archive',
        'arizona': 'University of Arizona',
        'asu': 'Arizona State University',
        'aub': 'American University of Beirut',
        'auburn': 'Auburn University',
        'auckland': 'University of Auckland',
        'augusta': 'Augusta University',
        'baylor': 'Baylor University',
        'bc': 'Boston College',
        'bently-umich': 'Bentley Historical Library, University of Michigan',
        'berkeley': 'University of California, Berkeley',
        'borndigital': 'Born Digital',
        'brandeis': 'Brandeis University',
        'brooklynmuseum': 'Brooklyn Museum',
        'brown': 'Brown University',
        'brynmawr': 'Bryn Mawr College',
        'bu': 'Boston University',
        'bucknell': 'Bucknell University',
        'buffalo': 'University At Buffalo, The State University of New York',
        'byu': 'Brigham Young University',
        'carleton': 'Carleton College',
        'case': 'Case Western Reserve University',
        'cgu': 'Claremont Graduate University',
        'chtanc': 'National Library of Taiwan',
        'claremont': 'Claremont University Consortium',
        'clark': 'Clark University',
        'clarkart': 'Sterling and Francine Clark Art Institute',
        'clements-umich': 'William L. Clements Library, University of Michigan',
        'clemson': 'Clemson University',
        'cmc': 'Claremont McKenna College',
        'colby': 'Colby College',
        'colorado': 'University of Colorado Boulder',
        'columbia': 'Columbia University',
        'coo': 'Cornell University',
        'cornell': 'Cornell University',
        'dartmouth': 'Dartmouth College',
        'depaul': 'DePaul University',
        'dickinson': 'Dickinson College',
        'duke': 'Duke University',
        'elon': 'Elon University',
        'emory': 'Emory University',
        'fau': 'Florida Atlantic University',
        'fiu': 'Florida International University',
        'flbog': 'State University System of Florida',
        'frick': 'The Frick Collection',
        'fsu': 'Florida State University',
        'gatech': 'Georgia Institute of Technology',
        'gc-cuny': 'CUNY Graduate School and University Center',
        'georgetown': 'Georgetown University',
        'getty': 'The Getty Research Institute',
        'gettyshib': 'Getty Research Institute',
        'gmu': 'George Mason University',
        'grinnell': 'Grinnell College',
        'google': 'Google',
        'gsu': 'Georgia State University',
        'harvard': 'Harvard University',
        'hathitrust': 'HathiTrust',
        'haverford': 'Haverford College',
        'hawaii': 'University of Hawaii',
        'hmc': 'Harvey Mudd College',
        'ht_private': 'Private Donor',
        'ht_support-microsoft': 'Digitization Support from Microsoft',
        'ia': 'Internet Archive',
        'ias': 'IAS, via Princeton University',
        'illinois': 'University of Illinois at Urbana-Champaign',
        'iu': 'Indiana University',
        'jhu': 'Johns Hopkins University',
        'keio': 'Keio University',
        'kennesaw': 'Kennesaw State University',
        'knowledgeunlatched': 'Knowledge Unlatched',
        'ksu': 'Kansas State University',
        'ku': 'University of Kansas',
        'lafayette': 'Lafayette College',
        'lit-dlps-dc': 'University of Michigan Library IT, Digital Library Production Service, Digital Conversion',
        'loc': 'Library of Congress',
        'macalester': 'Macalester College',
        'mcgill': 'McGill University',
        'mcmaster': 'McMaster University',
        'mdanderson': 'University of Texas M.D. Anderson Cancer Center',
        'mdl': 'Minnesota Digital Library',
        'mhs': 'Minnesota Historical Society',
        'miami': 'University of Miami',
        'milproj-dc-umich': 'Millennium Project',
        'missouri': 'University of Missouri-Columbia',
        'mit': 'Massachusetts Institute of Technology',
        'mndigital': 'Minnesota Digital Library',
        'mnhs': 'Minnesota Historical Society',
        'monash': 'Monash University',
        'montana': 'Montana State University',
        'mou': 'University of Missouri',
        'msu': 'Michigan State University',
        'mtholyoke': 'Mount Holyoke College',
        'ncl': 'National Central Library',
        'ncsu': 'North Carolina State University',
        'nd': 'University of Notre Dame',
        'neu': 'Northeastern University',
        'nfb': 'National Federation of the Blind',
        'nmsu': 'New Mexico State University - Las Cruces Campus',
        'nnc': 'Columbia University',
        'northwestern': 'Northwestern University',
        'nypl': 'New York Public Library',
        'nyu': 'New York University',
        'okstate': 'Oklahoma State University',
        'olemiss': 'University of Mississippi',
        'osu': 'The Ohio State University',
        'ou': 'University of Oklahoma',
        'pfw': 'Purdue University Fort Wayne',
        'pitt': 'University of Pittsburgh',
        'pomona': 'Pomona College',
        'press': 'University of Michigan Press',
        'princeton': 'Princeton University',
        'private': 'Private Donor',
        'psu': 'Pennsylvania State University',
        'purdue': 'Purdue University',
        'quensu': 'Queen\'s University',
        'richmond': 'University of Richmond',
        'rochester': 'University of Rochester',
        'rutgers': 'Rutgers University',
        'scripscollege': 'Scripps College',
        'smith': 'Smith College',
        'smu': 'Southern Methodist University',
        'stanford': 'Stanford University',
        'swarthmore': 'Swarthmore College',
        'swmed': 'University of Texas Southwestern Medical Center',
        'syr': 'Syracuse University',
        'tamu': 'Texas A&M',
        'tcu': 'Texas Christian University',
        'technicalreports': 'Technical Report Archive and Image Library',
        'temple': 'Temple University',
        'ttu': 'Texas Tech University',
        'tufts': 'Tufts University',
        'tulane': 'Tulane University',
        'txstate': 'Texas State University - San Marcos',
        'ua': 'University of Alabama',
        'ualberta': 'University of Alberta',
        'ubc': 'University of British Columbia',
        'uc': 'University of Cincinnati',
        'ucalgary': 'University of Calgary',
        'ucdavis': 'University of California, Davis',
        'ucf': 'University of Central Florida',
        'uchicago': 'University of Chicago',
        'uci': 'University of California, Irvine',
        'ucla': 'University of California, Los Angeles',
        'ucm': 'Universidad Complutense de Madrid',
        'ucmerced': 'University of California, Merced',
        'uconn': 'University of Connecticut',
        'ucop': 'University of California, Office of the President',
        'ucr': 'University of California, Riverside',
        'ucsc': 'University of California, Santa Cruz',
        'ucsf': 'University of California, San Francisco',
        'udel': 'University of Delaware',
        'ufl': 'University of Florida',
        'uga': 'University of Georgia',
        'uh': 'University of Houston',
        'uic': 'University of Illinois at Chicago',
        'uiowa': 'University of Iowa',
        'uky': 'University of Kentucky',
        'um-dc-mp': 'University of Michigan, Duderstadt Center, Millennium Project',
        'umass': 'University of Massachusetts',
        'umd': 'University of Maryland',
        'umdl-umich': 'University of Michigan Library IT, Digital Library Production Service, Digital Conversion Unit',
        'umich': 'University of Michigan',
        'umn': 'University of Minnesota',
        'ump': 'University of Michigan Press',
        'unc': 'University of North Carolina',
        'ung': 'University of North Georgia',
        'union': 'Union College',
        'universityofcalifornia': 'University of California',
        'unl': 'University of Nebraska - Lincoln',
        'unlv': 'University of Nevada - Las Vegas',
        'unm': 'University of New Mexico',
        'unr': 'University of Nevada, Reno',
        'uoregon': 'University of Oregon',
        'upenn': 'University of Pennsylvania',
        'uq': 'The University of Queensland',
        'usc': 'University of Southern California',
        'usf': 'University of South Florida',
        'usg': 'University System of Georgia',
        'usu': 'Utah State University',
        'usup': 'Utah State University Press',
        'usupress': 'Utah State University Press',
        'uta': 'University of Texas at Arlington',
        'utah': 'University of Utah',
        'utdallas': 'University of Texas at Dallas',
        'utep': 'University of Texas at El Paso',
        'utexas': 'University of Texas at Austin',
        'uth': 'University of Texas Health Science Center at Houston',
        'utk': 'University of Tennessee, Knoxville',
        'utoronto': 'University of Toronto',
        'utsa': 'University of Texas at San Antonio',
        'uuhhs': 'Unitarian Universalist History and Heritage Society',
        'uvm': 'University of Vermont',
        'uwf': 'University of West Florida',
        'uwyo': 'University of Wyoming',
        'vanderbilt': 'Vanderbilt University',
        'vcu': 'Virginia Commonwealth University',
        'virginia': 'University of Virginia',
        'vt': 'Virginia Tech',
        'washington': 'University of Washington',
        'wau': 'University of Washington',
        'wayne': 'Wayne State University',
        'wesleyan': 'Wesleyan University',
        'wfu': 'Wake Forest University',
        'whitman': 'Whitman College',
        'wichita': 'Wichita State University',
        'williams': 'Williams College',
        'wisc': 'University of Wisconsin',
        'wsu': 'Washington State University',
        'wustl': 'Washington University in St. Louis',
        'wvu': 'West Virginia University',
        'yale': 'Yale University',
        'yale2': 'Yale University'
    }

    identifierFields = [
        ('hathi', 'bib_key'),
        ('hathi', 'htid'),
        (None, 'source_id'),
        ('isbn', 'isbns'),
        ('issn', 'issns'),
        ('lccn', 'lccns'),
        ('oclc', 'oclcs')
    ]

    viafRoot = 'https://dev-platform.nypl.org/api/v0.1/research-now/viaf-lookup?queryName='  # noqa: E501

    corporateRoles = [
        'publisher', 'manufacturer', 'repository', 'digitizer',
        'responsible_organization'
    ]

    def __init__(self, ingestRecord, ingestDateTime=None):
        """Create the empty SFR data objects for one Hathi source row.

        Args:
            ingestRecord: The source data for this record; stored unchanged
                on ``self.ingest`` and read by key in the build methods.
            ingestDateTime: Optional modification time. A ``datetime`` (or
                an instance of a ``datetime`` subclass) is formatted as
                ``YYYY-MM-DD HH:MM:SS``; any other non-None value is stored
                exactly as provided. When None, the current time from
                ``datetime.now()`` is used instead.
        """
        # Initialize with empty SFR data objects
        # self.ingest contains the source data
        self.work = WorkRecord()
        self.ingest = ingestRecord
        self.instance = InstanceRecord()
        self.item = Format()
        self.rights = Rights()
        self.modified = ingestDateTime
        logger.debug('Initializing empty HathiRecord object')

        # We need a fallback modified date if none is provided
        if self.modified is None:
            self.modified = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            logger.debug(
                'Assigning generated timestamp of {} to new record'.format(
                    self.modified
                )
            )
        elif isinstance(self.modified, datetime):
            # isinstance (rather than an exact type comparison) so that
            # datetime subclasses are normalized to the same string format
            self.modified = self.modified.strftime('%Y-%m-%d %H:%M:%S')

    def __repr__(self):
        return '<Hathi(title={})>'.format(self.work.title)

    def buildDataModel(self, countryCodes):
        """Build the work, instance, item and rights records for this row.

        The row is validated first, then each level of the model is built in
        order (work, instance, item, rights). Finally ``getVIAF`` is called
        for every agent attached to the work, its instances and their items.

        Args:
            countryCodes: Passed through unchanged to ``buildInstance``.

        Raises:
            DataError: If the row's ``rights_statement`` value is not a key
                of ``HathiRecord.rightsReasons``.
        """
        logger.debug('Generating work record for bib record {}'.format(
            self.ingest['bib_key']
        ))

        # If we don't have a valid rights code, this means that the row has
        # been improperly formatted (generally fields out of order/misplaced).
        # Such a row cannot be trusted, so stop processing it by raising.
        if self.ingest['rights_statement'] not in HathiRecord.rightsReasons:
            raise DataError(
                '{} is malformed (columns missing or incorrect)'.format(
                    self.ingest['htid']
                )
            )

        self.buildWork()

        logger.debug('Generating instance record for hathi record {}'.format(
            self.ingest['htid']
        ))
        self.buildInstance(countryCodes)

        logger.debug('Generating an item record for hathi record {}'.format(
            self.ingest['htid']
        ))
        self.buildItem()

        logger.debug('Generate a rights object for the associated rights statement {}'.format(
            self.ingest['rights']
        ))

        # Generate a stand-alone rights object that contains the hathi
        # generated rights information
        self.createRights()

        # Run the VIAF lookup for agents at every level of the model
        for agent in self.work.agents:
            self.getVIAF(agent)

        for instance in self.work.instances:
            for agent in instance.agents:
                self.getVIAF(agent)
            for item in instance.formats:
                for agent in item.agents:
                    self.getVIAF(agent)

    def buildWork(self):
        """Populate ``self.work`` from the Hathi source row.

        Sets the title and primary identifier, parses every identifier
        column, applies the government-document check, records the copyright
        date and finally attempts to parse the author.
        """
        self.work.title = self.ingest['title']
        logger.info('Creating work record for {}'.format(self.work.title))

        # The HathiTrust bib reference is the primary identifier of the work
        self.work.primary_identifier = Identifier(
            type='hathi', identifier=self.ingest['bib_key'], weight=1
        )
        primaryMsg = 'Setting primary_identifier to {}'
        logger.debug(primaryMsg.format(self.work.primary_identifier))

        for idScheme, sourceKey in HathiRecord.identifierFields:
            logger.debug('Setting identifiers {}'.format(idScheme))
            self.parseIdentifiers(self.work, idScheme, sourceKey)

        # All government documents should be in the public_domain.
        self.parseGovDoc(self.ingest['gov_doc'], self.ingest['htid'])

        # Record the copyright date that HathiTrust assigned to the work
        copyrightDate = {
            'display_date': self.ingest['copyright_date'],
            'date_range': self.ingest['copyright_date'],
            'date_type': 'copyright_date',
        }
        self.work.addClassItem('dates', Date, **copyrightDate)
        copyrightMsg = 'Setting copyright date to {}'
        logger.debug(copyrightMsg.format(self.ingest['copyright_date']))

        # A KeyError raised here is reported as a missing author and the
        # work is kept
        try:
            self.parseAuthor(self.ingest['author'])
        except KeyError:
            missingMsg = 'No author associated with record {}'
            logger.warning(missingMsg.format(self.work))

    def buildInstance(self, countryCodes):
        """Construct the instance record from the Hathi data provided.

        As structured, a Hathi row always corresponds to a single instance.
        A work in Hathi can have multiple items, and that relationship is
        reflected in the data.

        Records are not merged at this phase; works and instances related by
        identifiers are associated later, when stored in the database.

        Args:
            countryCodes: Passed through unchanged to ``parsePubPlace``.
        """
        row = self.ingest
        self.instance.title = row['title']
        self.instance.language = row['language']
        self.instance.volume = row['description']

        logger.info('Creating instance record for work {}'.format(self.work))

        self.parsePubPlace(row['pub_place'], countryCodes)

        for idScheme, sourceKey in HathiRecord.identifierFields:
            logger.debug('Setting identifiers {}'.format(idScheme))
            self.parseIdentifiers(self.instance, idScheme, sourceKey)

        copyrightDate = {
            'display_date': row['copyright_date'],
            'date_range': row['copyright_date'],
            'date_type': 'copyright_date',
        }
        self.instance.addClassItem('dates', Date, **copyrightDate)
        copyrightMsg = 'Setting copyright date to {}'
        logger.debug(copyrightMsg.format(row['copyright_date']))

        # Cover retrieval is best-effort: any failure is logged and the
        # instance is still built without a cover link
        try:
            pageURL = HathiCover(row['htid']).getPageFromMETS()
            if pageURL is not None:
                logger.debug('Add cover image {} to instance'.format(pageURL))
                coverLink = {
                    'url': pageURL,
                    'media_type': 'image/jpeg',
                    'flags': {'cover': True, 'temporary': True},
                }
                self.instance.addClassItem('links', Link, **coverLink)
        except Exception as err:
            failureMsg = 'Unable to load cover for {}'
            logger.error(failureMsg.format(row['htid']))
            logger.debug(err)

        self.parsePubInfo(row['publisher_pub_date'])

        # Attach the finished instance to its parent work
        self.work.instances.append(self.instance)

    def buildItem(self):
        """Construct the item record for the digitized copy of the book.

        A HathiTrust row corresponds to a single item. Two links can be
        derived from it: the HathiTrust reader page, which is always added,
        and a direct PDF download, which is added only when the digitization
        entity is not Google. The provider, responsible and digitization
        entities are each stored as an agent on the item.
        """
        self.item.source = 'hathitrust'
        self.item.content_type = 'ebook'
        self.item.modified = self.modified

        itemMsg = 'Creating item record for instance {}'
        logger.info(itemMsg.format(self.instance))

        logger.debug('Setting htid {} for item'.format(self.ingest['htid']))
        self.parseIdentifiers(self.item, 'hathi', 'htid')

        linkMsg = 'Storing direct and download links based on htid {}'
        logger.debug(linkMsg.format(self.ingest['htid']))

        # Link to the external HathiTrust reader page
        readerURL = 'https://babel.hathitrust.org/cgi/pt?id={}'.format(
            self.ingest['htid']
        )
        self.item.addClassItem(
            'links', Link,
            url=readerURL,
            media_type='text/html',
            flags={
                'local': False,
                'download': False,
                'images': True,
                'ebook': True
            }
        )

        # Link to the direct PDF download, skipped for Google-digitized items
        digitizedBy = self.ingest['digitization_entity'].lower()
        if digitizedBy != 'google':
            pdfURL = 'https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id={}'.format(self.ingest['htid'])
            self.item.addClassItem(
                'links', Link,
                url=pdfURL,
                media_type='application/pdf',
                flags={
                    'local': False,
                    'download': True,
                    'images': True,
                    'ebook': True
                }
            )

        # (log template, ingest column, agent role) for each related entity,
        # processed in this order
        entityAgents = (
            ('Storing repository {} as agent',
             'provider_entity', 'repository'),
            ('Storing organization {} as agent',
             'responsible_entity', 'responsible_organization'),
            ('Storing digitizer {} as agent',
             'digitization_entity', 'digitizer'),
        )
        for logTemplate, column, role in entityAgents:
            logger.debug(logTemplate.format(self.ingest[column]))
            # Institution codes are looked up in lower case
            agentName = HathiRecord.sourceCodes[self.ingest[column].lower()]
            self.item.addClassItem(
                'agents', Agent, name=agentName, roles=[role]
            )

        # Attach the finished item to its parent instance
        self.instance.formats.append(self.item)

    def createRights(self):
        """Populate the stand-alone rights object from the HathiTrust row.

        The ``rights`` code is decoded into a license and statement through
        HathiRecord.rightsValues and the ``rights_statement`` code into a
        reason through HathiRecord.rightsReasons. The determination and
        copyright dates are attached as Date entries, and the finished
        object becomes the single rights entry of the work, instance and item.
        """
        row = self.ingest
        logger.info('Creating new rights object for row {}'.format(
            row['htid']
        ))

        rights = self.rights
        rights.source = 'hathi_trust'

        decoded = HathiRecord.rightsValues[row['rights']]
        rights.license = decoded['license']
        rights.rights_statement = decoded['statement']
        rights.rights_reason = HathiRecord.rightsReasons[
            row['rights_statement']
        ]

        # (ingest column, stored date type) for each date kept on the rights
        rightsDates = (
            ('rights_determination_date', 'determination_date'),
            ('copyright_date', 'copyright_date'),
        )
        for column, dateType in rightsDates:
            dateValue = row[column]
            rights.addClassItem(
                'dates',
                Date,
                display_date=dateValue,
                date_range=dateValue,
                date_type=dateType
            )

        # The same rights object is attached at all three levels of the SFR
        # model. It clearly applies to the instance and item; whether it
        # applies to the work is less certain, so works and instances may end
        # up carrying conflicting rights statements.
        for record in (self.work, self.instance, self.item):
            record.rights = [rights]

    def parseIdentifiers(self, record, idType, key):
        """Store every identifier found in one ingest column on a record.

        The column may hold several comma-separated values. Each value is
        trimmed and blank entries are skipped individually, so a stray or
        leading comma (",123", "123,,456") neither discards the remaining
        identifiers nor stores an empty one.

        Arguments:
        record -- object whose addClassItem() receives the identifiers
        idType -- identifier type label stored with each value
        key -- name of the column in self.ingest to read

        Returns None. A key missing from self.ingest is logged as a warning
        and otherwise ignored.
        """
        if key not in self.ingest:
            logger.warning('{} not a valid type of identifier'.format(key))
            return

        for rawValue in self.ingest[key].split(','):
            identifier = rawValue.strip()
            if not identifier:
                # Produced by empty columns and doubled/leading/trailing commas
                continue
            logger.debug('Storing identifier {} ({}) for {}'.format(
                identifier,
                idType,
                record
            ))
            record.addClassItem('identifiers', Identifier, **{
                'type': idType,
                'identifier': identifier,
                'weight': 1
            })

    def parseAuthor(self, authorStr):
        """Build an author Agent from a Hathi author string and add it to
        the work.

        Hathi author values may combine a name with lifespan dates. The first
        run of four or more date-like characters (digits, '-', 'c', '?', "'",
        '.') is treated as the dates and removed from the name. A lone date is
        stored as a death date unless a " b." / " b " marker in the original
        string identifies it as a birth date; a hyphenated pair is stored as
        birth and death dates. Blank components, such as the missing death
        date of an open range like "1950-", are skipped instead of being
        stored as empty Date entries.

        Arguments:
        authorStr -- raw value of the author column
        """
        logger.info('Storing author {} for work {}'.format(
            authorStr,
            self.work
        ))
        authorDateGroup = re.search(r'([0-9\-c?\'.]{4,})', authorStr)
        authorDates = None
        if authorDateGroup is not None:
            authorDates = authorDateGroup.group(1)
            authorName = authorStr.replace(authorDates, '').strip(' ,.')
            logger.debug('Found lifespan dates {}'.format(authorDates))
        else:
            authorName = authorStr
            logger.debug('Found no lifespan dates')

        authorRec = Agent(
            name=authorName,
            role='author'
        )

        def storeDate(dateValue, dateType):
            # An empty component carries no information, so nothing is stored
            if not dateValue:
                logger.debug('Skipping empty {}'.format(dateType))
                return
            authorRec.addClassItem('dates', Date, **{
                'display_date': dateValue,
                'date_range': dateValue,
                'date_type': dateType
            })

        if authorDates is not None:
            logger.info('Creating date objects for author lifespan')
            lifespan = authorDates.strip(' ,.').split('-')
            if len(lifespan) == 1:
                logger.debug('Found single date, default to death_date')
                dateType = 'death_date'
                # A "b." marker means the single date is a birth date; the
                # marker itself is removed from the stored name
                datePrefix = re.search(r' b(?: |\.)', authorStr)
                if datePrefix is not None:
                    authorRec.name = re.sub(
                        r' b(?: |\.|$)', '',
                        authorName
                    ).strip(' ,.')
                    logger.debug('Detected single birth_date (living author)')
                    dateType = 'birth_date'

                logger.debug('Storing single date {} of type {}'.format(
                    lifespan[0],
                    dateType
                ))
                storeDate(lifespan[0], dateType)

            else:
                logger.debug('Storing lifespan {}-{} as dates'.format(
                    lifespan[0],
                    lifespan[1]
                ))
                # Only the first two components are used, as before
                storeDate(lifespan[0], 'birth_date')
                storeDate(lifespan[1], 'death_date')
        logger.debug('Appending agent record {} to work'.format(authorRec))

        self.work.agents.append(authorRec)

    def getVIAF(self, agent):
        """Query the VIAF lookup service for an agent and copy the result
        onto it.

        The request URL is self.viafRoot followed by the URL-quoted agent
        name; agents holding any role listed in self.corporateRoles are
        searched with ``queryType=corporate``. When the JSON response has a
        'viaf' key, the agent receives the returned viaf and lcnaf values. If
        the response also supplies a different, non-empty name, the agent is
        renamed to it and its previous name is kept in agent.aliases.

        A response without a 'name' no longer raises KeyError; the agent
        simply keeps its current name.

        Arguments:
        agent -- object with name, roles and aliases attributes; updated in
            place
        """
        logger.info('Querying VIAF for {}'.format(agent.name))
        reqStr = '{}{}'.format(
            self.viafRoot, quote_plus(agent.name)
        )
        if not set(agent.roles).isdisjoint(self.corporateRoles):
            reqStr = '{}&queryType=corporate'.format(reqStr)

        # NOTE(review): no timeout is passed, so a stalled service blocks the
        # caller indefinitely -- consider adding one once callers are checked
        viafResp = requests.get(reqStr)
        responseJSON = viafResp.json()
        logger.debug(responseJSON)
        if 'viaf' not in responseJSON:
            return

        logger.debug('Found VIAF {} for agent'.format(
            responseJSON.get('viaf', None)
        ))
        viafName = responseJSON.get('name')
        if viafName and viafName != agent.name:
            # Keep the name we searched with as an alias of the VIAF name
            if agent.name not in agent.aliases:
                agent.aliases.append(agent.name)
            agent.name = viafName
        agent.viaf = responseJSON.get('viaf', None)
        agent.lcnaf = responseJSON.get('lcnaf', None)

    def parsePubPlace(self, pubPlace, countryCodes):
        """Set the instance's publication place from a country/state code.

        The trimmed code is looked up in countryCodes. When it is not found
        the trimmed code itself is stored and a warning is logged.
        NOTE: If that happens frequently, check the MARC site for an updated
        code list and issue a pull request replacing the XML included here.
        """
        placeCode = pubPlace.strip()
        try:
            decodedPlace = countryCodes[placeCode]
        except KeyError:
            self.instance.pub_place = placeCode
            logger.warning('Failed to decode pub_place code, setting to raw code {}'.format(self.instance.pub_place))
        else:
            self.instance.pub_place = decodedPlace
            logger.debug('Setting decoded pub_place to {}'.format(
                self.instance.pub_place
            ))

    def parsePubInfo(self, imprintInfo):
        """Split a combined imprint string into a date and a publisher.

        The first run of four or more date-like characters is stored on the
        instance as its publication date and removed from the string. The
        remainder, with any trailing run of two or more non-word characters
        trimmed, is stored as the publisher agent.
        """
        logger.info('Storing publication {} info for instance {}'.format(
            imprintInfo,
            self.instance
        ))

        dateMatch = re.search(r'([0-9\-c?\'.]{4,})', imprintInfo)
        if dateMatch is not None:
            pubDate = dateMatch.group(1).strip(' ,.')
            logger.debug('Storing publication date {}'.format(pubDate))
            self.instance.addClassItem(
                'dates',
                Date,
                display_date=pubDate,
                date_range=pubDate,
                date_type='publication_date'
            )
            imprintInfo = imprintInfo.replace(pubDate, '')

        # Drop punctuation left dangling once the date has been removed
        publisherName = re.sub(r'[\W]{2,}$', '', imprintInfo)
        logger.debug('Storing publisher as agent {}'.format(publisherName))
        self.instance.addClassItem(
            'agents',
            Agent,
            name=publisherName,
            roles=['publisher']
        )

    def parseGovDoc(self, govDocStatus, sourceID):
        """Record on the work whether it is a government document.

        The status counts as true only when its lower-cased string form is
        '1' or 't'. It is stored as a 'government_document' measurement with
        value 1 or 0, weight 1, the current local time and the given source.
        """
        isGovDoc = str(govDocStatus).lower() in ('1', 't')
        measurement = {
            'quantity': 'government_document',
            'value': int(isGovDoc),
            'weight': 1,
            'taken_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'source_id': sourceID
        }
        self.work.addClassItem('measurements', Measurement, **measurement)
        logger.debug('Storing gov_doc status to {}'.format(str(isGovDoc)))
예제 #4
0
 def test_format_setLink(self):
     # After setLink(), the first stored link must report its ebook flag.
     ebookFormat = Format('ebook', 'test', 'now')
     linkFlags = {'local': False, 'download': False, 'ebook': True}
     ebookFormat.setLink(
         url='https://hello.hello',
         mediaType='text/html',
         flags=linkFlags
     )
     firstLink = ebookFormat.links[0]
     self.assertTrue(firstLink.flags['ebook'])
예제 #5
0
 def test_format_create_with_link(self):
     # A Link passed positionally must end up as the first stored link.
     suppliedLink = Link('testing')
     linkedFormat = Format('ebook', suppliedLink, 'now')
     self.assertIsInstance(linkedFormat, Format)
     firstLink = linkedFormat.links[0]
     self.assertEqual(firstLink.url, 'testing')
예제 #6
0
 def test_format_create(self):
     # The first positional argument must be exposed as content_type.
     createdFormat = Format('ebook', 'http://test.test', 'now')
     self.assertIsInstance(createdFormat, Format)
     expectedType = 'ebook'
     self.assertEqual(createdFormat.content_type, expectedType)
예제 #7
0
 def test_format_create(self):
     # A Format built without any arguments is still a Format instance.
     emptyFormat = Format()
     self.assertIsInstance(emptyFormat, Format)
예제 #8
0
 def test_item_repr(self):
     # The string form must show the content type and the source.
     describedItem = Format(contentType='text/html', source='test')
     expectedText = '<Item(type=text/html, source=test)>'
     self.assertEqual(str(describedItem), expectedText)
예제 #9
0
 def test_fromat_create_with_link_object(self):
     # A Link object given via link= must be stored as the first link.
     suppliedLink = Link(url='http://fake.com')
     linkedItem = Format(link=suppliedLink)
     firstLink = linkedItem.links[0]
     self.assertIsInstance(firstLink, Link)
     self.assertEqual(firstLink.url, 'http://fake.com')
예제 #10
0
 def test_format_create_with_link(self):
     # A URL string given via link= must be stored as a Link with that URL.
     linkUrl = 'http://fake.com'
     linkedItem = Format(link=linkUrl)
     firstLink = linkedItem.links[0]
     self.assertIsInstance(firstLink, Link)
     self.assertEqual(firstLink.url, linkUrl)
예제 #11
0
class HathiRecord():
    """Class for constructing HathiTrust-based records in the SFR data model.
    This largely serves as a wrapper for classes imported from the SFR model,
    and includes functions that can build these up. It also contains several
    class-level lookup tables for codes/values provided in the Hathi CSV files.
    """

    # Reason codes supplied by Hathi to explain how an item's rights status
    # was determined, mapped to their human-readable descriptions.
    # createRights() indexes this table with the 'rights_statement' column and
    # stores the description as the rights reason, so the wording here is
    # what ends up on stored records.
    rightsReasons = {
        'bib': 'bibliographically-derived by automatic processes',
        'ncn': 'no printed copyright notice',
        'con': 'contractual agreement with copyright holder on file',
        'ddd': 'due diligence documentation on file',
        'man': 'manual access control override; see note for details',
        'pvt': 'private personal information visible',
        'ren': 'copyright renewal research was conducted',
        'nfi':
        'needs further investigation (copyright research partially complete)',
        'cdpp':
        'title page or verso contain copyright date and/or place of publication information not in bib record',
        'ipma': 'in-print and market availability research was conducted',
        'unp': 'unpublished work',
        'gfv': 'Google viewability set at VIEW_FULL',
        'crms':
        'derived from multiple reviews in the Copyright Review Management System',
        'add':
        'author death date research was conducted or notification was received from authoritative source',
        'exp':
        'expiration of copyright term for non-US work with corporate author',
        'del': 'deleted from the repository; see note for details',
        'gatt':
        'non-US public domain work restored to in-copyright in the US by GATT',
        'supp': 'suppressed from view; see note for details'
    }

    # Decodes Hathi rights codes into a license (a CreativeCommons link where
    # one exists, otherwise a short token) and a human-readable statement.
    # createRights() indexes this table with the 'rights' column, so a code
    # missing here raises KeyError there.
    rightsValues = {
        'pd': {
            'license': 'public_domain',
            'statement': 'Public Domain'
        },
        'ic': {
            'license': 'in_copyright',
            'statement': 'In Copyright'
        },
        'op': {
            'license': 'in_copyright (out_of_print)',
            'statement': 'Out of Print (implied to be in copyright)'
        },
        'orph': {
            'license': 'in_copyright (orphaned)',
            'statement': 'Copyright Orphaned (implied to be in copyright)'
        },
        'und': {
            'license': 'undetermined',
            'statement': 'Status Undetermined'
        },
        'ic-world': {
            'license': 'in_copyright (viewable)',
            'statement': 'In Copyright, permitted to be world viewable'
        },
        'nobody': {
            'license': 'in_copyright (blocked)',
            'statement': 'Blocked for all users'
        },
        'pdus': {
            'license': 'public_domain (us_only)',
            'statement': 'Public Domain when viewed in the US'
        },
        'cc-by-3.0': {
            'license': 'https://creativecommons.org/licenses/by/3.0/',
            'statement': 'Creative Commons Attribution License, 3.0 Unported'
        },
        'cc-by-nd-3.0': {
            'license':
            'https://creativecommons.org/licenses/by-nd/3.0/',
            'statement':
            'Creative Commons Attribution, No Derivatives License, 3.0 Unported'
        },
        # The statement must describe the No Derivatives license that the
        # by-nc-nd link points to (it previously read "Share Alike")
        'cc-by-nc-nd-3.0': {
            'license':
            'https://creativecommons.org/licenses/by-nc-nd/3.0/',
            'statement':
            'Creative Commons Attribution, Non-Commercial, No Derivatives License, 3.0 Unported'
        },
        # The 3.0 non-commercial codes mirror the 4.0 entries further down
        'cc-by-nc-3.0': {
            'license':
            'https://creativecommons.org/licenses/by-nc/3.0/',
            'statement':
            'Creative Commons Attribution, Non-Commercial License, 3.0 Unported'
        },
        'cc-by-nc-sa-3.0': {
            'license':
            'https://creativecommons.org/licenses/by-nc-sa/3.0/',
            'statement':
            'Creative Commons Attribution, Non-Commercial, Share Alike License, 3.0 Unported'
        },
        'cc-by-sa-3.0': {
            'license':
            'https://creativecommons.org/licenses/by-sa/3.0/',
            'statement':
            'Creative Commons Attribution, Share Alike License, 3.0 Unported'
        },
        'orphcand': {
            'license': 'in_copyright (90-day hold)',
            'statement': 'Orphan Candidate - in 90-day holding period'
        },
        'cc-zero': {
            'license': 'https://creativecommons.org/publicdomain/zero/1.0/',
            'statement': 'Creative Commons Universal Public Domain'
        },
        'und-world': {
            'license': 'undetermined',
            'statement': 'Copyright Status undetermined, world viewable'
        },
        'icus': {
            'license': 'in_copyright (in US)',
            'statement': 'In Copyright in the US'
        },
        'cc-by-4.0': {
            'license': 'https://creativecommons.org/licenses/by/4.0/',
            'statement':
            'Creative Commons Attribution 4.0 International License'
        },
        'cc-by-nd-4.0': {
            'license':
            'https://creativecommons.org/licenses/by-nd/4.0/',
            'statement':
            'Creative Commons Attribution, No Derivatives 4.0 International License'
        },
        'cc-by-nc-nd-4.0': {
            'license':
            'https://creativecommons.org/licenses/by-nc-nd/4.0/',
            'statement':
            'Creative Commons Attribution, Non-Commercial, No Derivatives 4.0 International License'
        },
        'cc-by-nc-4.0': {
            'license':
            'https://creativecommons.org/licenses/by-nc/4.0/',
            'statement':
            'Creative Commons Attribution, Non-Commercial 4.0 International License'
        },
        'cc-by-nc-sa-4.0': {
            'license':
            'https://creativecommons.org/licenses/by-nc-sa/4.0/',
            'statement':
            'Creative Commons Attribution, Non-Commercial, Share Alike 4.0 International License'
        },
        'cc-by-sa-4.0': {
            'license':
            'https://creativecommons.org/licenses/by-sa/4.0/',
            'statement':
            'Creative Commons Attribution, Share Alike 4.0 International License'
        },
        'pd-pvt': {
            'license': 'public_domain (privacy_limited)',
            'statement': 'Public Domain access limited for privacy concerns'
        },
        'supp': {
            'license': 'suppressed',
            'statement': 'Suppressed from view'
        }
    }

    # Maps the institution codes of organizations that have contributed
    # materials to HathiTrust to their display names. buildItem() indexes
    # this table with the 'responsible_entity' column, so a code missing here
    # raises KeyError there. All keys are lower case.
    sourceCodes = {
        'aub': 'American University of Beirut',
        'bc': 'Boston College',
        'columbia': 'Columbia University',
        'cornell': 'Cornell University',
        'duke': 'Duke University',
        'emory': 'Emory University',
        'flbog': 'State University System of Florida',
        'getty': 'Getty Research Institute',
        'harvard': 'Harvard University',
        'hathitrust': 'HathiTrust',
        'illinois': 'University of Illinois at Urbana-Champaign',
        'iu': 'Indiana University',
        'loc': 'Library of Congress',
        'mcgill': 'McGill University',
        'missouri': 'University of Missouri-Columbia',
        'msu': 'Michigan State University',
        'ncsu': 'North Carolina State University',
        'nd': 'University of Notre Dame',
        'northwestern': 'Northwestern University',
        'nypl': 'New York Public Library',
        'osu': 'The Ohio State University',
        'princeton': 'Princeton University',
        'psu': 'Pennsylvania State University',
        'purdue': 'Purdue University',
        'tamu': 'Texas A&M',
        'tufts': 'Tufts University',
        'ualberta': 'University of Alberta',
        'uchicago': 'University of Chicago',
        'ucm': 'University of California, Merced',
        'uconn': 'University of Connecticut',
        'udel': 'University of Delaware',
        'uiowa': 'University of Iowa',
        'umass': 'University of Massachusetts',
        'umd': 'University of Maryland',
        'umich': 'University of Michigan',
        'umn': 'University of Minnesota',
        'unc': 'University of North Carolina',
        'universityofcalifornia': 'University of California',
        'upenn': 'University of Pennsylvania',
        'uq': 'The University of Queensland',
        'usu': 'Utah State University',
        'utexas': 'University of Texas',
        'virginia': 'University of Virginia',
        'washington': 'University of Washington',
        'wfu': 'Wake Forest University',
        'wisc': 'University of Wisconsin',
        'yale': 'Yale University',
    }

    # Maps the codes of organizations that have digitized materials in
    # HathiTrust to their display names. buildItem() indexes this table with
    # the 'digitization_entity' column, so a code missing here raises KeyError
    # there. Note that the same code can mean something different here than
    # in sourceCodes ('ucm'), and that 'borndigital' maps to None, which
    # buildItem() passes on as the digitizer agent's name.
    digCodes = {
        'google': 'Google',
        'lit-dlps-dc':
        'Library IT, Digital Library Production Service, Digital Conversion',
        'ump': 'University of Michigan Press',
        'ia': 'Internet Archive',
        'yale': 'Yale University',
        'mdl': 'Minnesota Digital Library',
        'mhs': 'Minnesota Historical Society',
        'usup': 'Utah State University Press',
        'ucm': 'Universidad Complutense de Madrid',
        'purd': 'Purdue University',
        'getty': 'Getty Research Institute',
        'um-dc-mp':
        'University of Michigan, Duderstadt Center, Millennium Project',
        'uiuc': 'University of Illinois at Urbana-Champaign',
        'illinois': 'University of Illinois at Urbana-Champaign',
        'brooklynmuseum': 'Brooklyn Museum',
        'uf': 'State University of Florida',
        'tamu': 'Texas A&M',
        'udel': 'University of Delaware',
        'private': 'Private Donor',
        'umich': 'University of Michigan',
        'clark': 'Clark Art Institute',
        'ku': 'Knowledge Unlatched',
        'mcgill': 'McGill University',
        'bc': 'Boston College',
        'nnc': 'Columbia University',
        'geu': 'Emory University',
        'yale2': 'Yale University',
        'mou': 'University of Missouri-Columbia',
        'chtanc': 'National Central Library of Taiwan',
        'bentley-umich': 'Bentley Historical Library, University of Michigan',
        'clements-umich':
        'William L. Clements Library, University of Michigan',
        'wau': 'University of Washington',
        'cornell': 'Cornell University',
        'cornell-ms': 'Cornell University/Microsoft',
        'umd': 'University of Maryland',
        'frick': 'The Frick Collection',
        'northwestern': 'Northwestern University',
        'umn': 'University of Minnesota',
        'berkeley': 'University of California, Berkeley',
        'ucmerced': 'University of California, Merced',
        'nd': 'University of Notre Dame',
        'princeton': 'Princeton University',
        'uq': 'The University of Queensland',
        'ucla': 'University of California, Los Angeles',
        'osu': 'The Ohio State University',
        'upenn': 'University of Pennsylvania',
        'aub': 'American University of Beirut',
        'ucsd': 'University of California, San Diego',
        'harvard': 'Harvard University',
        'borndigital': None,
    }

    # (identifier type, ingest column) pairs. buildWork() and buildInstance()
    # iterate this list and hand each pair to parseIdentifiers(), which reads
    # the column from the ingest row and stores its values under the given
    # type. 'source_id' is stored with a type of None.
    identifierFields = [('hathi', 'bib_key'), ('hathi', 'htid'),
                        (None, 'source_id'), ('isbn', 'isbns'),
                        ('issn', 'issns'), ('lccn', 'lccns'),
                        ('oclc', 'oclcs')]

    def __init__(self, ingestRecord, ingestDateTime=None):
        """Wrap one ingest record in a set of empty SFR data objects.

        Arguments:
        ingestRecord -- the source data; kept on self.ingest and read by
            column name in the build/parse methods
        ingestDateTime -- modification timestamp for the record. A datetime
            (including instances of datetime subclasses) is formatted as
            'YYYY-MM-DD HH:MM:SS'; None is replaced by the current local
            time in that format; any other value is stored unchanged.
        """
        # Initialize with empty SFR data objects
        # self.ingest contains the source data
        self.work = WorkRecord()
        self.ingest = ingestRecord
        self.instance = InstanceRecord()
        self.item = Format()
        self.rights = Rights()
        self.modified = ingestDateTime
        logger.debug('Initializing empty HathiRecord object')

        # We need a fallback modified date if none is provided
        if self.modified is None:
            self.modified = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            logger.debug(
                'Assigning generated timestamp of {} to new record'.format(
                    self.modified))
        elif isinstance(self.modified, datetime):
            # isinstance also normalizes datetime subclasses, which an exact
            # type comparison would leave unformatted
            self.modified = self.modified.strftime('%Y-%m-%d %H:%M:%S')

    def __repr__(self):
        return '<Hathi(title={})>'.format(self.work.title)

    def buildDataModel(self, countryCodes):
        """Build the work, instance, item and rights records, in that order.

        Arguments:
        countryCodes -- lookup of publication place codes, handed to
            buildInstance()
        """
        row = self.ingest

        workMessage = 'Generating work record for bib record {}'
        logger.debug(workMessage.format(row['bib_key']))
        self.buildWork()

        instanceMessage = 'Generating instance record for hathi record {}'
        logger.debug(instanceMessage.format(row['htid']))
        self.buildInstance(countryCodes)

        itemMessage = 'Generating an item record for hathi record {}'
        logger.debug(itemMessage.format(row['htid']))
        self.buildItem()

        rightsMessage = (
            'Generate a rights object for the associated rights statement {}'
        )
        logger.debug(rightsMessage.format(row['rights']))

        # The rights object is stand-alone and carries the rights
        # information generated by Hathi
        self.createRights()

    def buildWork(self):
        """Populate the SFR Work record from the Hathi ingest row.

        Sets the title and series, makes the Hathi bib key the primary
        identifier, stores every identifier column listed in
        HathiRecord.identifierFields, records the government document status
        and the copyright date, and finally parses the author column.
        """
        row = self.ingest
        work = self.work

        work.title = row['title']
        work.series = row['description']
        logger.info('Creating work record for {}'.format(work.title))

        # A HathiTrust bib reference serves as the work's primary identifier
        bibIdentifier = Identifier(
            type='hathi',
            identifier=row['bib_key'],
            weight=1
        )
        work.primary_identifier = bibIdentifier
        logger.debug('Setting primary_identifier to {}'.format(
            work.primary_identifier))

        for fieldType, fieldName in HathiRecord.identifierFields:
            logger.debug('Setting identifiers {}'.format(fieldType))
            self.parseIdentifiers(work, fieldType, fieldName)

        # All government documents should be in the public_domain.
        self.parseGovDoc(row['gov_doc'])

        # HathiTrust assigns the copyright date stored on the work
        copyrightDate = row['copyright_date']
        work.addClassItem(
            'dates',
            Date,
            display_date=copyrightDate,
            date_range=copyrightDate,
            date_type='copyright_date'
        )
        logger.debug('Setting copyright date to {}'.format(
            row['copyright_date']))

        self.parseAuthor(row['author'])

    def buildInstance(self, countryCodes):
        """Construct the instance record from the Hathi data provided.

        As structured, a Hathi row always corresponds to a single instance,
        while a work in Hathi can have multiple items; that relationship is
        reflected in the data.

        No records are merged at this phase. Works and instances related by
        identifiers are associated later, when stored in the database.

        Arguments:
        countryCodes -- lookup used by parsePubPlace() to decode the
            publication place code
        """
        row = self.ingest
        instance = self.instance

        instance.title = row['title']
        instance.language = row['language']

        logger.info('Creating instance record for work {}'.format(self.work))

        self.parsePubPlace(row['pub_place'], countryCodes)

        for fieldType, fieldName in HathiRecord.identifierFields:
            logger.debug('Setting identifiers {}'.format(fieldType))
            self.parseIdentifiers(instance, fieldType, fieldName)

        copyrightDate = row['copyright_date']
        instance.addClassItem(
            'dates',
            Date,
            display_date=copyrightDate,
            date_range=copyrightDate,
            date_type='copyright_date'
        )
        logger.debug('Setting copyright date to {}'.format(
            row['copyright_date']))

        self.parsePubInfo(row['publisher_pub_date'])

        # Attach the finished instance to its parent work
        self.work.instances.append(instance)

    def buildItem(self):
        """Construct the item record for the digitized copy described by the
        row.

        Each HathiTrust row corresponds to a single item. Two links are
        derived from its htid: the HathiTrust reader page and a direct
        download of the PDF copy. The source, responsible organization and
        digitizer are stored as agents, and the item is appended to the
        instance's formats.
        """
        row = self.ingest
        item = self.item

        item.source = 'hathitrust'
        item.content_type = 'ebook'
        item.modified = self.modified

        logger.info('Creating item record for instance {}'.format(
            self.instance))

        htid = row['htid']
        logger.debug('Setting htid {} for item'.format(htid))
        self.parseIdentifiers(item, 'hathi', 'htid')

        logger.debug(
            'Storing direct and download links based on htid {}'.format(htid))
        # (url template, media type, relation): the external HathiTrust page
        # first, then the direct PDF download
        linkSpecs = (
            ('https://babel.hathitrust.org/cgi/pt?id={}',
             'text/html', 'external_view'),
            ('https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id={}',
             'application/pdf', 'pdf_download'),
        )
        for urlTemplate, mediaType, relType in linkSpecs:
            item.addClassItem(
                'links',
                Link,
                url=urlTemplate.format(htid),
                media_type=mediaType,
                rel_type=relType
            )

        logger.debug('Storing repository {} as agent'.format(row['source']))
        item.addClassItem(
            'agents', Agent, name=row['source'], roles=['repository'])

        logger.debug('Storing organization {} as agent'.format(
            row['responsible_entity']))
        organization = HathiRecord.sourceCodes[row['responsible_entity']]
        item.addClassItem(
            'agents',
            Agent,
            name=organization,
            roles=['responsible_organization']
        )

        logger.debug('Storing digitizer {} as agent'.format(
            row['digitization_entity']))
        digitizer = HathiRecord.digCodes[row['digitization_entity']]
        item.addClassItem(
            'agents', Agent, name=digitizer, roles=['digitizer'])

        # Attach the finished item to its parent instance
        self.instance.formats.append(item)

    def createRights(self):
        """Fill in the stand-alone rights object and attach it everywhere.

        HathiTrust supplies license, statement and justification data per
        item. The ``rights`` code is decoded through rightsValues, the
        ``rights_statement`` code through rightsReasons, and the
        determination and copyright dates are added as Date entries. The
        resulting object is then set as the only rights entry on the work,
        the instance and the item.
        """
        source = self.ingest
        target = self.rights

        logger.info('Creating new rights object for row {}'.format(
            source['htid']))

        target.source = 'hathi_trust'
        rightsEntry = HathiRecord.rightsValues[source['rights']]
        target.license = rightsEntry['license']
        target.rights_statement = rightsEntry['statement']
        reasonCode = source['rights_statement']
        target.rights_reason = HathiRecord.rightsReasons[reasonCode]

        determinationDate = source['rights_determination_date']
        target.addClassItem(
            'dates',
            Date,
            display_date=determinationDate,
            date_range=determinationDate,
            date_type='determination_date'
        )

        copyrightDate = source['copyright_date']
        target.addClassItem(
            'dates',
            Date,
            display_date=copyrightDate,
            date_range=copyrightDate,
            date_type='copyright_date'
        )

        # For now the same rights apply at all three levels of the SFR model.
        # They clearly describe the instance and item; for the work their
        # relevance is unclear, so a work and its instances may end up with
        # conflicting rights statements.
        for level in (self.work, self.instance, self.item):
            level.rights = [target]

    def parseIdentifiers(self, record, idType, key):
        """Split one ingest column into identifiers and store them on a
        record.

        The column value is split on commas. Every piece is trimmed and blank
        pieces are dropped one by one, so values such as ",123" or "123,,456"
        keep their real identifiers and never add an empty one.

        Arguments:
        record -- object whose addClassItem() receives the identifiers
        idType -- identifier type stored with each value (None for columns
            that have no type in identifierFields)
        key -- name of the column in self.ingest to read

        Returns None. A key missing from self.ingest is logged as a warning
        and otherwise ignored.
        """
        if key not in self.ingest:
            logger.warning('{} not a valid type of identifier'.format(key))
            return

        cleaned = (piece.strip() for piece in self.ingest[key].split(','))
        for identifier in cleaned:
            if not identifier:
                # Empty column, or a doubled/leading/trailing comma
                continue
            logger.debug('Storing identifier {} ({}) for {}'.format(
                identifier, idType, record))
            record.addClassItem(
                'identifiers', Identifier, **{
                    'type': idType,
                    'identifier': identifier,
                    'weight': 1
                })

    def parseAuthor(self, authorStr):
        """Hathi data files include an author column that combines author name
        with their birth and death dates (sometimes). This method parses
        those dates from the name and assigns them as Date objects to the
        constructed agent record. This record is then assigned to the work.

        Arguments:
        authorStr -- raw author string, e.g. a name optionally followed by a
        lifespan such as "1850-1900", "1950-" or "b. 1950"

        A single date is treated as a death_date unless the string carries a
        " b." / " b " marker, in which case it is a birth_date. Empty date
        components (for example the missing death date in "1950-") are not
        stored.
        """
        logger.info('Storing author {} for work {}'.format(
            authorStr, self.work))
        authorDateGroup = re.search(r'([0-9\-c?\'.]{4,})', authorStr)
        authorDates = None
        if authorDateGroup is not None:
            authorDates = authorDateGroup.group(1)
            # Remove the date text from the name and tidy what is left over
            authorName = authorStr.replace(authorDates, '').strip(' ,.')
            logger.debug('Found lifespan dates {}'.format(authorDates))
        else:
            authorName = authorStr
            logger.debug('Found no lifespan dates')

        authorRec = Agent(name=authorName, role='author')

        if authorDates is not None:
            logger.info('Creating date objects for author lifespan')
            lifespan = authorDates.strip(' ,.').split('-')
            if len(lifespan) == 1:
                logger.debug('Found single date, default to death_date')
                dateType = 'death_date'
                datePrefix = re.search(r' b(?: |\.)', authorStr)
                if datePrefix is not None:
                    # Drop the "b." marker from the name itself
                    authorRec.name = re.sub(r' b(?: |\.|$)', '',
                                            authorName).strip(' ,.')
                    logger.debug('Detected single birth_date (living author)')
                    dateType = 'birth_date'

                logger.debug('Storing single date {} of type {}'.format(
                    lifespan[0], dateType))
                lifeDates = [(dateType, lifespan[0])]
            else:
                logger.debug('Storing lifespan {}-{} as dates'.format(
                    lifespan[0], lifespan[1]))
                lifeDates = [
                    ('birth_date', lifespan[0]),
                    ('death_date', lifespan[1])
                ]

            for lifeDateType, lifeDate in lifeDates:
                # Splitting an open-ended lifespan such as "1950-" or "-1900"
                # leaves an empty component; storing it would create a Date
                # with no value, so only real dates are kept.
                if lifeDate == '':
                    logger.debug(
                        'Skipping empty {} for author'.format(lifeDateType))
                    continue
                authorRec.addClassItem(
                    'dates', Date, **{
                        'display_date': lifeDate,
                        'date_range': lifeDate,
                        'date_type': lifeDateType
                    })
        logger.debug('Appending agent record {} to work'.format(authorRec))
        self.work.agents.append(authorRec)

    def parsePubPlace(self, pubPlace, countryCodes):
        """Translate a publication place code into a country/state name.

        The whitespace-stripped code is looked up in countryCodes. When the
        lookup raises KeyError the stripped code itself is stored as the
        publication place and a warning is logged.
        NOTE: If this occurs frequently check the MARC site for an updated
        list and issue a pull request to replace the XML included here.
        """
        placeCode = pubPlace.strip()
        try:
            decodedPlace = countryCodes[placeCode]
        except KeyError:
            # Unknown code: keep the raw value rather than losing the data
            self.instance.pub_place = placeCode
            logger.warning(
                'Failed to decode pub_place code, setting to raw code {}'.
                format(self.instance.pub_place))
            return

        self.instance.pub_place = decodedPlace
        logger.debug('Setting decoded pub_place to {}'.format(
            self.instance.pub_place))

    def parsePubInfo(self, imprintInfo):
        """Similar to authors 'imprint' or publication info is combined into
        a single column. This extracts the date and attempts to clean up
        any trailing punctuation left over from this operation.

        Arguments:
        imprintInfo -- raw imprint string holding publisher and/or date

        The extracted date is stored on the instance as a publication_date
        and the remaining text as an agent with the 'publisher' role. Empty
        values are never stored: a match that contains no date text is
        ignored and no agent is created when nothing but the date (and
        punctuation) was present in the imprint.
        """
        logger.info('Storing publication {} info for instance {}'.format(
            imprintInfo, self.instance))
        pubDateGroup = re.search(r'([0-9\-c?\'.]{4,})', imprintInfo)
        if pubDateGroup is not None:
            pubDate = pubDateGroup.group(1).strip(' ,.')
            # The pattern also accepts runs of bare punctuation (e.g. "....")
            # which strip down to nothing; that is not a date.
            if pubDate != '':
                logger.debug('Storing publication date {}'.format(pubDate))
                self.instance.addClassItem(
                    'dates', Date, **{
                        'display_date': pubDate,
                        'date_range': pubDate,
                        'date_type': 'publication_date'
                    })
                imprintInfo = imprintInfo.replace(pubDate, '')

        # Two or more trailing non-word characters are leftovers from the
        # removed date (", " or " ." etc). A single trailing character is
        # kept by the pattern so abbreviations such as "Co." survive, which
        # means a lone trailing space must be trimmed separately.
        imprintInfo = re.sub(r'[\W]{2,}$', '', imprintInfo).strip()
        if imprintInfo == '':
            logger.debug('No publisher name left in imprint, skipping agent')
            return

        logger.debug('Storing publisher as agent {}'.format(imprintInfo))
        self.instance.addClassItem(
            'agents', Agent, **{
                'name': imprintInfo,
                'roles': ['publisher']
            })

    def parseGovDoc(self, govDocStatus):
        """Record on the work whether it is flagged as a government document.

        The flag counts as set when the lower-cased string form of
        govDocStatus is '1' or 't'; any other value counts as not set. The
        result is stored as a 'government_document' measurement with a value
        of 1 or 0, a weight of 1 and the current time as taken_at.
        """
        isGovDoc = str(govDocStatus).lower() in ('1', 't')
        measurement = {
            'quantity': 'government_document',
            'value': 1 if isGovDoc else 0,
            'weight': 1,
            'taken_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        self.work.addClassItem('measurements', Measurement, **measurement)
        logger.debug('Storing gov_doc status to {}'.format(isGovDoc))
예제 #12
0
 def test_format_setLink(self):
     """Check that a link added through setLink is readable at links[0]
     with the relType that was passed in.
     """
     # NOTE(review): the meaning of the three positional Format arguments is
     # not visible from here -- confirm against the Format constructor.
     testFormat = Format('ebook', 'test', 'now')
     testFormat.setLink(**{'url': 'https://hello.hello', 'mediaType': 'text/html', 'relType': 'reference'})
     self.assertEqual(testFormat.links[0].relType, 'reference')