Пример #1
0
            if record.metadata.get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = record.metadata.get_element(
                    './/{0}type'.format(dc), delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            try:
                preview = assets.thumbnail_service(PURL_match, tn)
            except UnboundLocalError as err:
                # logging.warning('aggregation.preview: {0} - {1}'.format(err, oai_id))
                print(err, oai_id)
                continue

            # aggregation.provider

            try:
                docs.append({
                    "@context": "http://api.dp.la/items/context",
                    "sourceResource": sourceResource,
                    "aggregatedCHO": "#sourceResource",
                    "dataProvider": data_provider,
                    "isShownAt": PURL_match,
                    "preview": preview,
Пример #2
0
def SSDN_QDC(file_in, tn, dprovide, iprovide=None):
    """Convert a file of OAI Qualified Dublin Core records into built docs.

    Every record read from ``file_in`` is mapped onto a ``sourceResource``
    dict and handed to ``assets.build``.  Records flagged as deleted are
    skipped silently; records without rights, without a title, or without
    an identifier containing ``http`` are logged and skipped.

    Args:
        file_in: path of the UTF-8 encoded file to read.
        tn: passed through unchanged to ``assets.thumbnail_service``.
        dprovide: data provider; used to label the logger and passed to
            ``assets.build``.
        iprovide: passed to ``assets.build`` as its final argument
            (presumably the intermediate provider -- confirm in ``assets``).

    Returns:
        list: the objects returned by ``assets.build``, one per kept record.
    """
    # Trailing "( Contributor )" / "( contributor )" marker on a creator name.
    contributor_marker = re.compile(r'\s*\(\s*[Cc]ontributor\s*\)$')
    rights_uri = re.compile('http://rightsstatements')

    def strip_contributor_marker(name):
        # str.rstrip(chars) treats its argument as a *set of characters*,
        # so rstrip("( Contributor )") also eats trailing letters of the
        # name itself ("Robert" -> "Robe").  Remove the literal marker only.
        return contributor_marker.sub('', name.strip(' '))

    # Date elements, tried in order; the first one present wins.
    date_paths = (
        './/{0}created'.format(dcterms),
        './/{0}issued'.format(dcterms),
        './/{0}date'.format(dcterms),
        './/{0}date'.format(dc),
        './/{0}available'.format(dcterms),
        './/{0}dateAccepted'.format(dcterms),
        './/{0}dateCopyrighted'.format(dcterms),
        './/{0}dateSubmitted'.format(dcterms),
    )

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_QDC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if record.attrib.get('deleted') == 'true':
                    continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            # (find() returns None when there is no header -> AttributeError)
            try:
                header = record.find('./{*}header')
                if header.attrib.get('status') == 'deleted':
                    continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            get = record.metadata.get_element

            # sourceResource.alternative
            alt_title = get('.//{0}alternative'.format(dcterms))
            if alt_title:
                sourceResource['alternative'] = alt_title

            # sourceResource.collection
            collection = get('.//{0}isPartOf'.format(dcterms))
            if collection:
                sourceResource['collection'] = collection

            # sourceResource.contributor
            contributor_path = './/{0}contributor'.format(dc)
            if get(contributor_path):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in get(contributor_path, delimiter=';')
                ]

            # sourceResource.creator
            creator_path = './/{0}creator'.format(dc)
            if get(creator_path):
                sourceResource['creator'] = []
                for name in get(creator_path, delimiter=';'):
                    # matches both "( Contributor )" and "( contributor )"
                    if "ontributor )" in name:
                        # creators tagged as contributors are re-filed
                        sourceResource.setdefault('contributor', []).append(
                            {"name": strip_contributor_marker(name)})
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = None
            for date_path in date_paths:
                date = get(date_path)
                if date is not None:
                    break
            if date is not None:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description (dc:description + dcterms:abstract)
            description = []
            for description_path in ('.//{0}description'.format(dc),
                                     './/{0}abstract'.format(dcterms)):
                items = get(description_path)
                if items is not None:
                    description.extend(items)
            if description:
                sourceResource['description'] = description

            # sourceResource.extent
            extent_path = './/{0}extent'.format(dcterms)
            if get(extent_path):
                sourceResource['extent'] = get(extent_path, delimiter=';')

            # sourceResource.format
            medium_path = './/{0}medium'.format(dcterms)
            if get(medium_path):
                # values found in IANA_type_list are left out of format
                formats = [
                    {'name': element.strip(' ')}
                    for element in get(medium_path, delimiter=';')
                    if element.lower() not in IANA_type_list
                    and len(element) > 0
                ]
                if formats:
                    sourceResource['format'] = formats

            # sourceResource.genre

            # sourceResource.identifier
            sourceResource['identifier'] = oai_id

            # sourceResource.language
            language_path = './/{0}language'.format(dc)
            if get(language_path):
                sourceResource['language'] = []
                for element in get(language_path, delimiter=';'):
                    # values of up to three characters are stored as codes
                    if len(element) > 3:
                        sourceResource['language'].append({"name": element})
                    else:
                        sourceResource['language'].append(
                            {"iso_639_3": element})

            # sourceResource.place : sourceResource['spatial']
            spatial_path = './/{0}spatial'.format(dcterms)
            if get(spatial_path):
                for place in get(spatial_path, delimiter=';'):
                    # skip values that parse as numbers
                    try:
                        float(place)
                    except ValueError:
                        # NOTE(review): each match overwrites the previous
                        # one, so only the last non-numeric place is kept --
                        # confirm whether accumulating was intended.
                        sourceResource['spatial'] = [place]

            # sourceResource.publisher
            publisher = get('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights_statements = get('.//{0}rights'.format(dc))
            if rights_statements:
                for rights_statement in rights_statements:
                    # a rightsstatements URI wins over any free-text value
                    if rights_uri.search(rights_statement):
                        sourceResource['rights'] = [{
                            "@id": rights_statement.strip()
                        }]
                        break
                    sourceResource['rights'] = [{
                        "text": rights_statement.strip()
                    }]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            subject_path = './/{0}subject'.format(dc)
            if get(subject_path):
                sourceResource['subject'] = [
                    {"name": name}
                    for name in get(subject_path, delimiter=';')
                ]

            # sourceResource.temporal
            temporal = get('.//{0}temporal'.format(dcterms))
            if temporal:
                sourceResource['temporal'] = temporal

            # sourceResource.title
            title = get('.//{0}title'.format(dc))
            if title is not None:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            type_path = './/{0}type'.format(dc)
            if get(type_path):
                sourceResource['type'] = get(type_path, delimiter=';')

            # webResource.fileFormat
            #  TODO: file_format kicked out of SR.genre

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt / aggregation.preview
            # Reset for every record so a record without an http identifier
            # can never inherit the previous record's URL or thumbnail.
            is_shown_at = None
            preview = None
            for identifier in get('.//{0}identifier'.format(dc)) or []:
                if 'http' in identifier:
                    is_shown_at = identifier
                    preview = assets.thumbnail_service(identifier, tn)

            # aggregation.provider

            # build record
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue
            docs.append(
                assets.build(oai_id, sourceResource, data_provider,
                             is_shown_at, preview, iprovide))

    return docs
Пример #3
0
def SSDN_MODS(file_in, tn, dprovide, iprovide=None):
    """Convert a file of OAI MODS records into built docs.

    Every record read from ``file_in`` is mapped onto a ``sourceResource``
    dict and handed to ``assets.build``.  Deleted records and records
    without metadata are skipped silently; records without a PURL, rights
    or a title are logged and skipped.

    Args:
        file_in: path of the UTF-8 encoded file to read.
        tn: passed through unchanged to ``assets.thumbnail_service``.
        dprovide: data provider; used to label the logger and passed to
            ``assets.build``.
        iprovide: passed to ``assets.build`` as its final argument
            (presumably the intermediate provider -- confirm in ``assets``).

    Returns:
        list: the objects returned by ``assets.build``, one per kept record.
    """
    toc_path = './/{http://www.loc.gov/mods/v3}tableOfContents'

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_MODS', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            # (skips the record, matching the other SSDN_* mappings; the
            # check used to end in a no-op `pass`)
            try:
                if record.attrib.get('deleted') == 'true':
                    continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            # (find() returns None when there is no header -> AttributeError)
            try:
                header = record.find('./{*}header')
                if header.attrib.get('status') == 'deleted':
                    continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}

            metadata = record.metadata
            if metadata is None:
                continue

            # sourceResource.alternative: every title after the first
            if len(metadata.titles) > 1:
                sourceResource['alternative'] = list(metadata.titles[1:])

            # sourceResource.collection

            # sourceResource.contributor
            try:
                for name in metadata.names:
                    if (name.role.text != 'Creator'
                            and name.role.code != 'cre'
                            and name.role.text is not None
                            and name.role.code is not None):
                        # NOTE(review): each match overwrites the previous
                        # one, so only the last contributor is kept --
                        # confirm whether accumulating was intended.
                        sourceResource['contributor'] = [{
                            "@id": name.uri,
                            "name": name.text
                        } if name.uri else {
                            "name": name.text
                        }]
            except KeyError as err:
                logger.error('sourceResource.contributor: {0}, {1}'.format(
                    err, oai_id))

            # sourceResource.creator: explicit creators plus any name whose
            # role text or role code is missing
            name_list = []
            if metadata.get_creators:
                name_list.extend(metadata.get_creators)
            if metadata.names:
                name_list.extend(
                    name for name in metadata.names
                    if name.role.text is None or name.role.code is None)
            sourceResource['creator'] = [{
                "@id": name.uri,
                "name": name.text
            } if name.uri else {
                "name": name.text
            } for name in name_list]

            # sourceResource.date
            if metadata.dates:
                date = metadata.dates[0].text
                if ' - ' in date:
                    # "begin - end" range: first and last four characters
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date[0:4],
                        "end": date[-4:]
                    }
                else:
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date,
                        "end": date
                    }

            # sourceResource.description: abstracts, then tables of contents
            description = []
            if metadata.abstract:
                description.extend(
                    abstract.text for abstract in metadata.abstract)
            description.extend(toc.text for toc in metadata.iterfind(toc_path))
            if description:
                sourceResource['description'] = description

            # sourceResource.extent
            if metadata.extent:
                sourceResource['extent'] = metadata.extent

            # sourceResource.format
            if metadata.genre:
                sourceResource['format'] = [{
                    'name': genre.text,
                    '@id': genre.uri
                } if genre.uri else {
                    'name': genre.text
                } for genre in metadata.genre]

            # sourceResource.identifier
            try:
                purl = metadata.purl[0]
            except IndexError as err:
                logger.error('sourceResource.identifier: {0}, {1}'.format(
                    err, oai_id))
                continue
            sourceResource['identifier'] = purl

            # sourceResource.language
            try:
                if metadata.language:
                    sourceResource['language'] = [{
                        "name": lang.text,
                        "iso_639_3": lang.code
                    } for lang in metadata.language]
            except AttributeError as err:
                logger.error('sourceResource.language: {0}, {1}'.format(
                    err, oai_id))

            # sourceResource.place : sourceResource['spatial']
            for subject in metadata.subjects:
                # iterate the element directly; getchildren() is deprecated
                for child in subject.elem:
                    if 'eographic' in child.tag:
                        # NOTE(review): only the last geographic subject
                        # survives -- confirm whether a list was intended.
                        sourceResource['spatial'] = {"name": subject.text}

            # sourceResource.publisher
            if metadata.publisher:
                sourceResource['publisher'] = metadata.publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            if metadata.rights:
                sourceResource['rights'] = [{
                    "@id": rights.text
                } if "http://rightsstatements.org" in rights.text else {
                    "text": rights.text
                } for rights in metadata.rights[:2]]
                # slicing isn't ideal here since it depends on element order
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            try:
                if metadata.subjects:
                    sourceResource['subject'] = []
                    for subject in metadata.subjects:
                        for child in subject.elem:
                            if 'eographic' not in child.tag:
                                sourceResource['subject'].append(
                                    {"name": subject.text})
            except (TypeError, IndexError) as err:
                logger.error('sourceResource.subject: {0}, {1}'.format(
                    err, oai_id))

            # sourceResource.title
            if metadata.titles:
                sourceResource['title'] = ['{}'.format(metadata.titles[0])]
            else:
                logger.error('No sourceResource.title: {0}'.format(oai_id))
                continue

            # sourceResource.type
            sourceResource['type'] = metadata.type_of_resource

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            pid = metadata.pid
            if pid is None:
                # fall back to the last segment of the OAI URN, with
                # underscores turned back into colons
                pid = oai_id.split(':')[-1].replace('_', ':')
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            # build record
            # An empty PURL used to fall through and re-append the previous
            # record's doc; skip the record explicitly instead.
            if not purl:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue
            docs.append(
                assets.build(oai_id, sourceResource, data_provider, purl,
                             preview, iprovide))

    return docs
Пример #4
0
def FlaLD_QDC(file_in, tn, dprovide, iprovide=None):
    """Convert a file of Qualified Dublin Core records into DPLA item dicts.

    Every record in ``OAI_QDC(data_in).record_list`` is mapped onto a
    ``sourceResource`` dict and wrapped in a dict carrying the
    ``http://api.dp.la/items/context`` context.  Records marked
    ``deleted="true"`` are skipped silently; records without rights, a
    title, or an identifier containing ``http`` are skipped with a warning.

    Args:
        file_in: path of the UTF-8 encoded file to read.
        tn: passed through unchanged to ``assets.thumbnail_service``.
        dprovide: stored as ``dataProvider`` in every output dict.
        iprovide: accepted for signature parity with the sibling mappings;
            not used by this function.

    Returns:
        list[dict]: one dict per kept record.
    """
    dc_ns = nameSpace_default['dc']
    dcterms_ns = nameSpace_default['dcterms']
    rightsURI = re.compile('http://rightsstatements')

    def split_terms(record, path):
        # Flatten split_lookup's nested result, dropping empty terms and
        # trimming surrounding spaces.
        terms = []
        for element in OAI_QDC.split_lookup(record, path):
            for term in element:
                if len(term) > 0:
                    terms.append(term.strip(' '))
        return terms

    with open(file_in, encoding='utf-8') as data_in:
        records = OAI_QDC(data_in)
        docs = []
        for record in records.record_list:

            # Only records explicitly flagged deleted="true" are skipped.
            # (Previously any record carrying a `deleted` attribute was
            # dropped, whatever its value.)
            if record.attrib.get('deleted') == 'true':
                continue

            oai_id = record.attrib['id']

            sourceResource = {}

            # sourceResource.alternative
            alt_title = OAI_QDC.simple_lookup(
                record, './/{0}alternative'.format(dcterms_ns))
            if alt_title is not None:
                sourceResource['alternative'] = alt_title

            # sourceResource.collection

            # sourceResource.contributor
            contributor_path = './/{0}contributor'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, contributor_path) is not None:
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in split_terms(record, contributor_path)
                ]

            # sourceResource.creator
            creator_path = './/{0}creator'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, creator_path) is not None:
                sourceResource['creator'] = [
                    {"name": name}
                    for name in split_terms(record, creator_path)
                ]

            # sourceResource.date
            date = OAI_QDC.simple_lookup(
                record, './/{0}created'.format(dcterms_ns))
            if date is not None:
                sourceResource['date'] = {"begin": date[0], "end": date[0]}

            # sourceResource.description (dc:description + dcterms:abstract)
            description = []
            for description_path in ('.//{0}description'.format(dc_ns),
                                     './/{0}abstract'.format(dcterms_ns)):
                items = OAI_QDC.simple_lookup(record, description_path)
                if items is not None:
                    description.extend(items)
            # a single description is stored bare, several as a list
            if len(description) > 1:
                sourceResource['description'] = description
            elif len(description) == 1:
                sourceResource['description'] = description[0]

            # sourceResource.extent
            extent_path = './/{0}extent'.format(dcterms_ns)
            if OAI_QDC.simple_lookup(record, extent_path) is not None:
                sourceResource['extent'] = split_terms(record, extent_path)

            # sourceResource.format

            # sourceResource.genre
            format_path = './/{0}format'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, format_path) is not None:
                # values found in IANA_type_list are left out of genre
                genre = [
                    term for term in split_terms(record, format_path)
                    if term.lower() not in IANA_type_list
                ]
                if genre:
                    sourceResource['genre'] = genre

            # sourceResource.identifier
            local_id = OAI_QDC.simple_lookup(
                record, './/{0}identifier'.format(dc_ns))
            if local_id is not None:
                sourceResource['identifier'] = local_id[0]

            # sourceResource.language
            language_path = './/{0}language'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, language_path) is not None:
                sourceResource['language'] = []
                for element in OAI_QDC.split_lookup(record, language_path):
                    for term in element:
                        # NOTE(review): every term replaces the value, so
                        # only the last one is kept (as a dict, not a list)
                        # -- confirm whether appending was intended.
                        if len(term) > 3:
                            sourceResource['language'] = {"name": term}
                        else:
                            sourceResource['language'] = {"iso_639_3": term}

            # sourceResource.place : sourceResource['spatial']
            spatial_path = './/{0}spatial'.format(dcterms_ns)
            if OAI_QDC.simple_lookup(record, spatial_path) is not None:
                sourceResource['spatial'] = split_terms(record, spatial_path)

            # sourceResource.publisher
            publisher = OAI_QDC.simple_lookup(
                record, './/{0}publisher'.format(dc_ns))
            if publisher is not None:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights_path = './/{0}rights'.format(dc_ns)
            rights = OAI_QDC.simple_lookup(record, rights_path)
            if rights is None:
                logging.warning('No sourceResource.rights - {0}'.format(oai_id))
                continue
            if len(record.findall(rights_path)) > 1:
                # Start from an empty entry for every record so a value can
                # neither be unbound nor leak in from an earlier record.
                rights_entry = {}
                uri_match = None
                rights_text = None
                for rights_statement in rights:
                    if rightsURI.search(rights_statement):
                        # the URI is the last space-separated token
                        uri_match = rights_statement.split(" ")[-1]
                    else:
                        rights_text = rights_statement
                if uri_match is not None:
                    rights_entry["@id"] = uri_match
                if rights_text is not None:
                    rights_entry["text"] = rights_text
                sourceResource['rights'] = rights_entry
            else:
                sourceResource['rights'] = rights

            # sourceResource.subject
            subject_path = './/{0}subject'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, subject_path) is not None:
                sourceResource['subject'] = [
                    {"name": term}
                    for term in split_terms(record, subject_path)
                ]

            # sourceResource.title
            title = OAI_QDC.simple_lookup(
                record, './/{0}title'.format(dc_ns))
            if title is None:
                logging.warning('No sourceResource.title - {0}'.format(oai_id))
                continue
            sourceResource['title'] = title

            # sourceResource.type
            type_path = './/{0}type'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, type_path) is not None:
                sourceResource['type'] = split_terms(record, type_path)

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt / aggregation.preview
            # Reset for every record: without this a record lacking an http
            # identifier raised NameError or reused the previous record's URL.
            is_shown_at = None
            preview = None
            for identifier in local_id or []:
                if 'http' in identifier:
                    is_shown_at = identifier
                    preview = assets.thumbnail_service(identifier, tn)
            if is_shown_at is None:
                logging.warning(
                    'No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.provider

            docs.append({"@context": "http://api.dp.la/items/context",
                         "sourceResource": sourceResource,
                         "aggregatedCHO": "#sourceResource",
                         "dataProvider": data_provider,
                         "isShownAt": is_shown_at,
                         "preview": preview,
                         "provider": PROVIDER})
    return docs
Пример #5
0
def SSDN_DC(file_in, tn, dprovide, iprovide=None):
    """Build DPLA-style documents from the Dublin Core records in *file_in*.

    Args:
        file_in: Path of the harvested file; it is opened as UTF-8 text and
            handed to ``OAIReader``.
        tn: Passed through unchanged to ``assets.thumbnail_service``.
        dprovide: Data provider value; used as the ``provider`` of the CSV
            logger and passed to ``assets.build``.
        iprovide: Optional value passed as the last argument of
            ``assets.build``.

    Returns:
        list: The result of ``assets.build`` for every record that was kept.

    A record is skipped when it is flagged as deleted, when it has no
    ``rights`` element, when the ``title`` lookup returns ``None``, or when
    none of its identifiers contains ``'http'``.
    """
    mark_up_re = re.compile('<.*?>')
    new_line_re = re.compile('\n')
    # Anchored suffix match; str.rstrip() takes a *set of characters* and
    # would also eat trailing letters of the name itself.
    contributor_tag_re = re.compile(r'\( [Cc]ontributor \)$')
    lcsh_tag_re = re.compile(r'\( lcsh \)$')
    rights_uri = re.compile('http://rightsstatements')

    def clean_mark_up(text):
        """Remove tag-like ``<...>`` runs and replace newlines with spaces."""
        clean_text = mark_up_re.sub('', text)
        clean_text = new_line_re.sub(' ', clean_text)
        return clean_text

    def strip_contributor_tag(name):
        """Return *name* without a trailing '( Contributor )' marker."""
        return contributor_tag_re.sub('', name.strip(" ")).rstrip(" ")

    def dc_path(tag):
        """Return the search path for *tag* in the module-level ``dc`` namespace."""
        return './/{0}{1}'.format(dc, tag)

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_DC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                # no header element (find() returned None)
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            metadata = record.metadata

            # sourceResource.alternative

            # sourceResource.collection
            collection = metadata.get_element(dc_path('relation'))
            if collection:
                sourceResource['collection'] = collection

            # sourceResource.contributor
            if metadata.get_element(dc_path('contributor')):
                sourceResource['contributor'] = [{
                    "name": name
                } for name in metadata.get_element(
                    dc_path('contributor'), delimiter=';')]

            # sourceResource.creator
            if metadata.get_element(dc_path('creator')):
                sourceResource['creator'] = []
                for name in metadata.get_element(
                        dc_path('creator'), delimiter=';'):
                    # Names tagged "( Contributor )" / "( contributor )" are
                    # filed under contributor instead of creator.
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append(
                            {"name": strip_contributor_tag(name)})
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = metadata.get_element(dc_path('date'))
            if date:
                try:
                    # dateparser.parse() returning None surfaces here as an
                    # AttributeError on .date()
                    d = dateparser.parse(date[0],
                                         languages=['en']).date().isoformat()
                    sourceResource['date'] = {
                        "begin": d,
                        "end": d,
                        "displayDate": d
                    }
                except AttributeError as err:
                    logger.warning('sourceResource.date: {0}, {1}'.format(
                        err, record.oai_urn))
                    # fall back to the raw value
                    sourceResource['date'] = date[0]

            # sourceResource.description
            if metadata.get_element(dc_path('description')):
                sourceResource['description'] = [
                    clean_mark_up(desc)
                    for desc in metadata.get_element(
                        dc_path('description'), delimiter=';')
                ]

            # sourceResource.extent

            # sourceResource.format
            dc_format = metadata.get_element(dc_path('format'))
            if dc_format:
                sourceResource['format'] = dc_format

            # sourceResource.genre

            # sourceResource.identifier
            sourceResource['identifier'] = oai_id

            # sourceResource.language
            if metadata.get_element(dc_path('language')):
                sourceResource['language'] = list(
                    metadata.get_element(dc_path('language'), delimiter=';'))

            # sourceResource.place : sourceResource['spatial']
            coverage = metadata.get_element(dc_path('coverage'))
            if coverage:
                sourceResource['spatial'] = [{
                    'name': place
                } for place in coverage]

            # sourceResource.publisher
            publisher = metadata.get_element(dc_path('publisher'))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = metadata.get_element(dc_path('rights'))
            if rights:
                for rights_statement in rights:
                    uri = rights_uri.search(rights_statement)
                    if uri:
                        # a rightsstatements URI wins over any free text
                        sourceResource['rights'] = [{
                            "@id": uri.string.strip()
                        }]
                        break
                    else:
                        sourceResource['rights'] = [{
                            "text":
                            rights_statement.strip()
                        }]

            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            if metadata.get_element(dc_path('subject')):
                sourceResource['subject'] = []
                for term in metadata.get_element(
                        dc_path('subject'), delimiter=';'):
                    term = lcsh_tag_re.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(". ")})

            # sourceResource.temporal

            # sourceResource.title
            title = metadata.get_element(dc_path('title'))
            if title is not None:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if metadata.get_element(dc_path('type')):
                sourceResource['type'] = metadata.get_element(
                    dc_path('type'), delimiter=';')

            # webResource.fileFormat
            #  TODO: file_format kicked out of SR.genre

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            # Reset for every record so a record without an http identifier
            # can never inherit the previous record's URL.
            is_shown_at = None
            for identifier in metadata.get_element(
                    dc_path('identifier')) or []:
                if 'http' in identifier:
                    is_shown_at = identifier

            # aggregation.preview
            preview = None
            try:
                preview = assets.thumbnail_service(record, tn)
            except (TypeError, UnboundLocalError) as err:
                # a missing thumbnail is not fatal; the record keeps preview=None
                logger.warning('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record
            if is_shown_at is None:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            docs.append(
                assets.build(oai_id, sourceResource, data_provider,
                             is_shown_at, preview, iprovide))

    return docs
Пример #6
0
def FlaLD_MODS(file_in, tn, dprovide, iprovide=None):
    """Build DPLA-style documents from the MODS records in *file_in*.

    Args:
        file_in: Path of the harvested file; it is opened as UTF-8 text and
            handed to ``MODSReader``.
        tn: Passed through unchanged to ``assets.thumbnail_service``.
        dprovide: Value stored under ``dataProvider`` in every document.
        iprovide: Accepted for signature parity with the other mappers; not
            used by this function.

    Returns:
        list: One dict per kept record with the keys ``@context``,
        ``sourceResource``, ``aggregatedCHO``, ``dataProvider``,
        ``isShownAt``, ``preview`` and ``provider``.

    A record is skipped (with a warning) when ``record.rights()`` or
    ``record.title_constructor()`` returns ``None``.
    """
    # matches strings that contain no ASCII letter at all
    non_alpha_char = re.compile("^[^a-zA-Z]+$")

    def is_creator(name):
        """Tell whether a name dict carries a creator role (text or code)."""
        if 'roleText' in name.keys():
            return name['roleText'].lower() == 'creator'
        if 'roleCode' in name.keys():
            return name['roleCode'].lower() == 'cre'
        return False

    def name_entry(name):
        """Turn a name dict into the output form, keeping its URI if present."""
        if 'valueURI' in name.keys():
            return {"@id": name['valueURI'], "name": name['text']}
        return {"name": name['text']}

    def genre_entry(genre):
        """Rename the 'term'/'valueURI' keys of a genre dict to 'name'/'@id'."""
        genre_elem = {}
        for key, value in genre.items():
            if 'term' == key:
                genre_elem['name'] = value
            elif 'valueURI' == key:
                genre_elem['@id'] = value
        return genre_elem

    def one_or_many(values):
        """Return the single item of *values*, or a list copy when several."""
        if len(values) > 1:
            return list(values)
        return values[0]

    with open(file_in, encoding='utf-8') as data_in:
        records = MODSReader(data_in)
        docs = []
        for record in records:

            sourceResource = {}

            # sourceResource.alternative
            titles = record.title_constructor()
            if titles is not None:
                # everything after the first title; this is an empty list
                # when the record only has a main title
                sourceResource['alternative'] = list(titles[1:])

            # sourceResource.collection
            collection = record.collection()
            if collection is not None:
                sourceResource['collection'] = {}
                if 'title' in collection.keys():
                    sourceResource['collection']['name'] = collection['title']
                if 'location' in collection.keys():
                    sourceResource['collection']['host'] = collection['location']
                if 'url' in collection.keys():
                    sourceResource['collection']['_:id'] = collection['url']

            # sourceResource.contributor
            try:
                names = record.name_constructor()
                if names is not None:
                    sourceResource['contributor'] = []
                    for name in names:
                        # every name without a creator role is a contributor
                        if not is_creator(name):
                            sourceResource['contributor'].append(
                                name_entry(name))

                    if len(sourceResource['contributor']) < 1:
                        del sourceResource['contributor']

            except KeyError as err:
                logging.warning('sourceResource.contributor: {0}, {1}\n'.format(err, record.pid_search()))

            # sourceResource.creator
            names = record.name_constructor()
            if names is not None:
                sourceResource['creator'] = []
                for name in names:
                    if is_creator(name):
                        sourceResource['creator'].append(name_entry(name))

            # sourceResource.date
            date = record.date_constructor()
            if date is not None:
                if ' - ' in date:
                    # range: first and last four characters are the bounds
                    sourceResource['date'] = { "displayDate": date,
                                               "begin": date[0:4],
                                               "end": date[-4:] }
                else:
                    sourceResource['date'] = { "displayDate": date,
                                               "begin": date,
                                               "end": date }

            # sourceResource.description
            abstract = record.abstract()
            if abstract is not None:
                if len(abstract) > 1:
                    sourceResource['description'] = list(abstract)
                else:
                    # NOTE(review): unlike extent/format/publisher the value
                    # is stored as returned here, not unwrapped with [0]
                    sourceResource['description'] = abstract

            # sourceResource.extent
            extent = record.extent()
            if extent is not None:
                sourceResource['extent'] = one_or_many(extent)

            # sourceResource.format
            form = record.form()
            if form is not None:
                sourceResource['format'] = one_or_many(form)

            # sourceResource.genre
            genres = record.genre()
            if genres is not None:
                if len(genres) > 1:
                    sourceResource['genre'] = [genre_entry(genre)
                                               for genre in genres]
                else:
                    sourceResource['genre'] = genre_entry(genres[0])

            # sourceResource.identifier
            sourceResource['identifier'] = { "@id": record.purl_search(),
                                             "text": record.local_identifier() }

            # sourceResource.language
            languages = record.language()
            if languages is not None:
                language_list = []
                for language in languages:
                    # Build each entry from the keys actually present so an
                    # entry without 'text' can neither re-append the previous
                    # language nor hit an unbound local.
                    language_dict = {}
                    if 'text' in language.keys():
                        language_dict["name"] = language['text']
                    if 'code' in language.keys():
                        language_dict["iso_639_3"] = language['code']
                    if language_dict:
                        language_list.append(language_dict)
                sourceResource['language'] = language_list

            # sourceResource.place : sourceResource['spatial']
            geo_code_list = record.geographic_code()
            if geo_code_list is not None:
                sourceResource['spatial'] = []
                for geo_code in geo_code_list:
                    # first item of the tuple is not needed here
                    _, lat, long, label = assets.tgn_cache(geo_code)
                    sourceResource['spatial'].append({"lat": lat,
                                                      "long": long,
                                                      "name": label,
                                                      "_:attribution": "This record contains information from Thesaurus of Geographic Names (TGN) which is made available under the ODC Attribution License." })

            # sourceResource.publisher
            publisher = record.publisher()
            if publisher is not None:
                sourceResource['publisher'] = one_or_many(publisher)

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = record.rights()
            if rights is not None:
                if len(rights) > 1:
                    sourceResource['rights'] = {"@id": rights['URI'],
                                                "text": rights['text']}
                else:
                    sourceResource['rights'] = rights['text']
            else:
                logging.warning('No sourceResource.rights - {0}'.format(record.pid_search()))
                continue

            # sourceResource.subject
            try:
                subjects = record.subject()
                if subjects is not None:
                    sourceResource['subject'] = []
                    for subject in subjects:
                        # drop subjects made up solely of non-letters
                        if non_alpha_char.match(subject['text']) is not None:
                            continue
                        if 'valueURI' in subject.keys():
                            sourceResource['subject'].append({"@id": subject['valueURI'],
                                                              "name": subject['text'] })
                        else:
                            sourceResource['subject'].append({"name": subject['text'] })

            except TypeError as err:
                logging.warning('sourceResource.subject: {0}, {1}\n'.format(err, record.pid_search()))

            # sourceResource.title
            if titles is not None:
                sourceResource['title'] = titles[0]
            else:
                logging.warning('No sourceResource.title: {0}'.format(record.pid_search()))
                continue

            # sourceResource.type
            sourceResource['type'] = record.type_of_resource()

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider #TODO

            # aggregation.isShownAt

            # aggregation.preview
            pid = record.pid_search()
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            docs.append({"@context": "http://api.dp.la/items/context",
                         "sourceResource": sourceResource,
                         "aggregatedCHO": "#sourceResource",
                         "dataProvider": data_provider,
                         "isShownAt": record.purl_search(),
                         "preview": preview,
                         "provider": PROVIDER})
        return docs
Пример #7
0
def FlaLD_DC(file_in, tn, dprovide, iprovide=None):
    """Build DPLA-style documents from the Dublin Core records in *file_in*.

    Args:
        file_in: Path of the harvested file; it is opened as UTF-8 text and
            handed to ``OAI_QDC``.
        tn: Passed through unchanged to ``assets.thumbnail_service``.
        dprovide: Value stored under ``dataProvider`` in every document.
        iprovide: Accepted for signature parity with the other mappers; not
            used by this function.

    Returns:
        list: One dict per kept record with the keys ``@context``,
        ``sourceResource``, ``aggregatedCHO``, ``dataProvider``,
        ``isShownAt``, ``preview`` and ``provider``.

    A record is skipped when its ``deleted`` attribute is ``'true'``, when
    the rights or title lookup returns ``None``, or when none of its
    identifiers matches the dPanther PURL pattern (the PURL is the value
    used for ``isShownAt`` and for the thumbnail lookup).
    """
    dPantherPURL = re.compile('dpService/dpPurlService/purl')
    # Anchored suffix match; str.rstrip() takes a *set of characters* and
    # would also eat trailing letters of the name itself.
    contributor_tag_re = re.compile(r'\( [Cc]ontributor \)$')
    lcsh_tag_re = re.compile(r'\( lcsh \)$')

    def dc_path(tag):
        """Return the search path for *tag* in the default dc namespace."""
        return './/{0}{1}'.format(nameSpace_default['dc'], tag)

    def strip_contributor_tag(name):
        """Return *name* without a trailing '( Contributor )' marker."""
        return contributor_tag_re.sub('', name.strip(" ")).rstrip(" ")

    with open(file_in, encoding='utf-8') as data_in:
        records = OAI_QDC(data_in)
        docs = []
        for record in records.record_list:

            # Only records explicitly flagged deleted="true" are dropped; any
            # other value of the attribute is processed normally.
            if ('deleted' in record.attrib.keys()
                    and record.attrib['deleted'] == 'true'):
                continue

            oai_id = record.attrib['id']

            sourceResource = {}

            # sourceResource.alternative

            # sourceResource.collection

            # sourceResource.contributor
            if OAI_QDC.simple_lookup(record, dc_path('contributor')) is not None:
                sourceResource['contributor'] = []
                for element in OAI_QDC.split_lookup(record, dc_path('contributor')):
                    for name in element:
                        if len(name) > 0:
                            sourceResource['contributor'].append({"name": name.strip(" ") })

            # sourceResource.creator
            if OAI_QDC.simple_lookup(record, dc_path('creator')) is not None:
                sourceResource['creator'] = []
                for element in OAI_QDC.split_lookup(record, dc_path('creator')):
                    for name in element:
                        # Names tagged "( Contributor )" / "( contributor )"
                        # are filed under contributor instead of creator.
                        if "ontributor )" in name:
                            sourceResource.setdefault('contributor', []).append(
                                {"name": strip_contributor_tag(name)})
                        elif len(name) > 0:
                            sourceResource['creator'].append({"name": name.strip(" ") })

            # sourceResource.date
            date = OAI_QDC.simple_lookup(record, dc_path('date'))
            if date is not None:
                sourceResource['date'] = { "begin": date[0], "end": date[0] }

            # sourceResource.description
            found_descriptions = OAI_QDC.simple_lookup(record, dc_path('description'))
            description = list(found_descriptions) if found_descriptions is not None else []
            if len(description) > 1:
                sourceResource['description'] = description
            elif len(description) == 1:
                sourceResource['description'] = description[0]

            # sourceResource.extent

            # sourceResource.format
            dpla_format = OAI_QDC.simple_lookup(record, dc_path('format'))
            if dpla_format is not None:
                sourceResource['format'] = dpla_format

            # sourceResource.genre

            # sourceResource.identifier
            # Reset for every record so a record without a PURL can never
            # inherit the previous record's URL.
            PURL_match = None
            identifier = OAI_QDC.simple_lookup(record, dc_path('identifier'))
            if identifier is not None and len(identifier) > 1:
                sourceResource['identifier'] = []
                for ID in identifier:
                    try:
                        PURL = dPantherPURL.search(ID)
                        if PURL:
                            # the PURL is kept for isShownAt, not as an identifier
                            PURL_match = PURL.string
                        else:
                            sourceResource['identifier'].append(ID)
                    except TypeError as err:
                        logging.warning('sourceResource.identifier: {0} - {1}\n'.format(err, oai_id))
            else:
                sourceResource['identifier'] = identifier

            # sourceResource.language
            if OAI_QDC.simple_lookup(record, dc_path('language')) is not None:
                sourceResource['language'] = []
                for element in OAI_QDC.split_lookup(record, dc_path('language')):
                    for term in element:
                        # NOTE(review): each term overwrites the value, so only
                        # the last term survives - confirm this is intended
                        if len(term) > 3:
                            sourceResource['language'] = {"name": term }
                        else:
                            sourceResource['language'] = { "iso_639_3": term }

            # sourceResource.place : sourceResource['spatial']
            place = OAI_QDC.simple_lookup(record, dc_path('coverage'))
            if place is not None:
                sourceResource['spatial'] = place

            # sourceResource.publisher
            publisher = OAI_QDC.simple_lookup(record, dc_path('publisher'))
            if publisher is not None:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = OAI_QDC.simple_lookup(record, dc_path('rights'))
            if rights is not None:
                sourceResource['rights'] = rights
            else:
                logging.warning('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            if OAI_QDC.simple_lookup(record, dc_path('subject')) is not None:
                sourceResource['subject'] = []
                for element in OAI_QDC.split_lookup(record, dc_path('subject')):
                    for term in element:
                        term = lcsh_tag_re.sub('', term)
                        if len(term) > 0:
                            sourceResource['subject'].append({"name": term.strip(" ") })

            # sourceResource.title
            title = OAI_QDC.simple_lookup(record, dc_path('title'))
            if title is not None:
                sourceResource['title'] = title
            else:
                logging.warning('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if OAI_QDC.simple_lookup(record, dc_path('type')) is not None:
                sourceResource['type'] = []
                for element in OAI_QDC.split_lookup(record, dc_path('type')):
                    for term in element:
                        if len(term) > 0:
                            sourceResource['type'].append(term.strip(" "))

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            if PURL_match is None:
                logging.warning('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.preview
            preview = assets.thumbnail_service(PURL_match, tn)

            # aggregation.provider

            docs.append({"@context": "http://api.dp.la/items/context",
                         "sourceResource": sourceResource,
                         "aggregatedCHO": "#sourceResource",
                         "dataProvider": data_provider,
                         "isShownAt": PURL_match,
                         "preview": preview,
                         "provider": PROVIDER})

    return docs
Пример #8
0
def FlMem(file_in, tn, dprovide, iprovide=None):
    """Map Florida Memory OAI Dublin Core records to aggregation documents.

    Args:
        file_in: Path of the harvested OAI file; it is opened as UTF-8 text.
        tn: Thumbnail argument handed unchanged to
            ``assets.thumbnail_service``.
        dprovide: Value used as the data provider of every record.
        iprovide: Optional intermediate provider handed to ``assets.build``.

    Returns:
        list: The ``assets.build`` result for every record that is not
        deleted, has a ``dc:title`` and has an ``http`` identifier.
    """
    # Matches a trailing "( Contributor )" / "( contributor )" role marker.
    contributor_marker = re.compile(r'\(\s*[Cc]ontributor\s*\)\s*$')

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlMem', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if ('deleted' in record.attrib.keys()
                        and record.attrib['deleted'] == 'true'):
                    continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                # find() yields None without a header; the resulting
                # AttributeError simply means "not flagged as deleted".
                header = record.find('./{*}header')
                if ('status' in header.attrib.keys()
                        and header.attrib['status'] == 'deleted'):
                    continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}

            # sourceResource.alternative

            # sourceResource.collection
            source = record.metadata.get_element('.//{0}source'.format(dc))
            if source:
                sourceResource['collection'] = {'name': source[0]}

            # sourceResource.contributor
            if record.metadata.get_element('.//{0}contributor'.format(dc)):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in record.metadata.get_element(
                        './/{0}contributor'.format(dc), delimiter=';')
                ]

            # sourceResource.creator
            if record.metadata.get_element('.//{0}creator'.format(dc)):
                sourceResource['creator'] = []
                for name in record.metadata.get_element(
                        './/{0}creator'.format(dc), delimiter=';'):
                    # covers both ( Contributor ) and ( contributor )
                    if "ontributor )" in name:
                        # Remove the marker as a suffix.  str.rstrip() takes
                        # a *set of characters* and would also strip the end
                        # of the name itself ("John" -> "Joh").
                        sourceResource.setdefault('contributor', []).append({
                            "name":
                            contributor_marker.sub('', name.strip(" "))
                            .strip(" ")
                        })
                    elif name:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = record.metadata.get_element('.//{0}date'.format(dc))
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            if record.metadata.get_element('.//{0}description'.format(dc)):
                sourceResource['description'] = record.metadata.get_element(
                    './/{0}description'.format(dc), delimiter=';')

            # sourceResource.extent

            # sourceResource.format
            dc_format = record.metadata.get_element('.//{0}format'.format(dc))
            if dc_format:
                sourceResource['format'] = dc_format

            # sourceResource.genre

            # sourceResource.identifier
            # Reset for every record so that a record without an http
            # identifier can never inherit the previous record's URL.
            is_shown_at = None
            identifiers = record.metadata.get_element(
                './/{0}identifier'.format(dc))
            for identifier in identifiers or []:
                if 'http' in identifier:
                    # swap the host part for the public site and force https
                    is_shown_at = identifier.replace(
                        identifier.split('/')[2], 'www.floridamemory.com')
                    is_shown_at = is_shown_at.replace('http:', 'https:')
            sourceResource['identifier'] = oai_id.replace(
                oai_id.split(':')[1], 'www.floridamemory.com')

            # sourceResource.language
            if record.metadata.get_element('.//{0}language'.format(dc)):
                sourceResource['language'] = [
                    assets.iso639_2code(lang.split('-')[0])
                    for lang in record.metadata.get_element(
                        './/{0}language'.format(dc), delimiter=';')
                ]

            # sourceResource.place : sourceResource['spatial']
            coverage = record.metadata.get_element(
                './/{0}coverage'.format(dc))
            if coverage:
                sourceResource['spatial'] = [{'name': place}
                                             for place in coverage]

            # sourceResource.publisher
            publisher = record.metadata.get_element(
                './/{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights  # TODO: hard-coding is only temporary
            sourceResource['rights'] = {
                '@id': 'http://rightsstatements.org/vocab/NoC-US/1.0/'
            }

            # rights = record.metadata.get_element('.//{0}rights'.format(dc))
            # if rights:
            #     sourceResource['rights'] = [{'text': rights[0]}]
            # else:
            #     logger.error('No sourceResource.rights - {0}'.format(oai_id))
            #     # continue  # TODO renable for prod
            #     pass  # local test

            # sourceResource.subject
            if record.metadata.get_element('.//{0}subject'.format(dc)):
                sourceResource['subject'] = []
                for term in record.metadata.get_element(
                        './/{0}subject'.format(dc), delimiter=';'):
                    term = re.sub(r"\( lcsh \)$", '', term)
                    if term:
                        sourceResource['subject'].append(
                            {"name": term.strip(". ")})

            # sourceResource.title
            title = record.metadata.get_element('.//{0}title'.format(dc))
            if title:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.temporal
            temporal = record.metadata.get_element('.//{0}coverage'.format(dc))
            if temporal:
                sourceResource['temporal'] = temporal

            # sourceResource.type : dc:type values, then dc:format values
            if record.metadata.get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = record.metadata.get_element(
                    './/{0}type'.format(dc), delimiter=';')
            if record.metadata.get_element('.//{0}format'.format(dc)):
                if 'type' in sourceResource:
                    sourceResource['type'] = sourceResource[
                        'type'] + record.metadata.get_element(
                            './/{0}format'.format(dc))
                else:
                    sourceResource['type'] = record.metadata.get_element(
                        './/{0}format'.format(dc), delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.preview
            preview = assets.thumbnail_service(is_shown_at, tn)

            # aggregation.provider

            # build record
            docs.append(
                assets.build(oai_id, sourceResource, data_provider,
                             is_shown_at, preview, iprovide))

    return docs
Пример #9
0
def FlaLD_MODS(file_in, tn, dprovide, iprovide=None):
    """Map MODS records from an OAI harvest to aggregation documents.

    Args:
        file_in: Path of the harvested OAI file; it is opened as UTF-8 text.
        tn: Thumbnail argument handed unchanged to
            ``assets.thumbnail_service``.
        dprovide: Default data provider, used unless the record's IID starts
            with one of the partner prefixes listed below.
        iprovide: Default intermediate provider handed to ``assets.build``.

    Returns:
        list: The ``assets.build`` result for every record that is not
        deleted and has metadata, a PURL, rights and a title (and whose
        geographic codes could be resolved without a ``TypeError``).
    """
    # (IID prefix, data provider) pairs.  A record whose IID starts with one
    # of these prefixes gets the paired data provider and
    # 'Florida State University Libraries' as its intermediate provider.
    partner_prefixes = (
        ('FSU_FBCTLH', 'First Baptist Church of Tallahassee'),
        ('FSU_LeonHigh', 'Leon High School, Tallahassee, Florida'),
        ('FSU_Godby', 'Godby High School, Tallahassee, Florida'),
        ('FSU_HHHS', 'Havana History & Heritage Society, Havana, Florida'),
        # ('FSU_Ringling', 'John and Mable Ringling Museum of Art'),
    )

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlaLD_MODS', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if ('deleted' in record.attrib.keys()
                        and record.attrib['deleted'] == 'true'):
                    continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                # find() yields None without a header; the resulting
                # AttributeError simply means "not flagged as deleted".
                header = record.find('./{*}header')
                if ('status' in header.attrib.keys()
                        and header.attrib['status'] == 'deleted'):
                    continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}

            metadata = record.metadata
            if metadata is None:
                continue

            # sourceResource.alternative : every title after the first
            if len(metadata.titles) > 1:
                sourceResource['alternative'] = list(metadata.titles[1:])

            # sourceResource.collection
            if metadata.collection:
                collection = metadata.collection
                sourceResource['collection'] = {}
                if collection.title:
                    sourceResource['collection']['name'] = collection.title
                if collection.location:
                    sourceResource['collection']['host'] = collection.location
                if collection.url:
                    sourceResource['collection']['_:id'] = collection.url

            # sourceResource.contributor
            # Collect *all* matching names (assigning inside the loop kept
            # only the last one).  A name is a contributor only when neither
            # its role text nor its role code marks it as a creator.
            contributors = []
            try:
                for name in metadata.names:
                    if name.role.text != 'Creator' and name.role.code != 'cre':
                        contributors.append({
                            "@id": name.uri,
                            "name": name.text
                        } if name.uri else {
                            "name": name.text
                        })
            except KeyError as err:
                logger.error('sourceResource.contributor: {0}, {1}'.format(
                    err, oai_id))
            if contributors:
                sourceResource['contributor'] = contributors

            # sourceResource.creator
            if metadata.get_creators:
                sourceResource['creator'] = [{
                    "@id": name.uri,
                    "name": name.text
                } if name.uri else {
                    "name": name.text
                } for name in metadata.get_creators]

            # sourceResource.date
            if metadata.dates:
                date = metadata.dates[0].text
                if ' - ' in date:
                    # range: first and last four characters are the years
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date[0:4],
                        "end": date[-4:]
                    }
                else:
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date,
                        "end": date
                    }

            # sourceResource.description
            if metadata.abstract:
                sourceResource['description'] = [
                    abstract.text for abstract in metadata.abstract
                ]

            # sourceResource.extent
            if metadata.extent:
                sourceResource['extent'] = metadata.extent

            # sourceResource.format
            if metadata.form:
                sourceResource['format'] = metadata.form

            # sourceResource.genre
            if metadata.genre:
                sourceResource['genre'] = [{
                    'name': genre.text,
                    '@id': genre.uri
                } if genre.uri else {
                    'name': genre.text
                } for genre in metadata.genre]

            # sourceResource.identifier
            try:
                purl = metadata.purl[0]
            except IndexError as err:
                logger.error('sourceResource.identifier: {0}, {1}'.format(
                    err, oai_id))
                continue
            sourceResource['identifier'] = purl

            # sourceResource.language
            try:
                if metadata.language:
                    sourceResource['language'] = [{
                        "name": lang.text,
                        "iso_639_3": lang.code
                    } for lang in metadata.language]
            except AttributeError as err:
                logger.error('sourceResource.language: {0}, {1}'.format(
                    err, oai_id))

            # sourceResource.place : sourceResource['spatial']
            try:
                if metadata.geographic_code and len(
                        metadata.geographic_code) > 0:
                    sourceResource['spatial'] = []
                    for geo_code in metadata.geographic_code:
                        code, lat, lon, label = assets.tgn_cache(
                            geo_code.strip())
                        sourceResource['spatial'].append({
                            "lat":
                            lat,
                            "long":
                            lon,
                            "name":
                            label,
                            "_:attribution":
                            "This record contains information from Thesaurus of Geographic Names (TGN) which is made available under the ODC Attribution License."
                        })
            except TypeError as err:
                logger.error('sourceResource.spatial: {0}, {1}'.format(
                    err, oai_id))
                continue

            # sourceResource.publisher
            if metadata.publisher:
                sourceResource['publisher'] = metadata.publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            if metadata.rights:
                sourceResource['rights'] = [{
                    "@id": rights.uri
                } if rights.uri else {
                    "text": rights.text
                } for rights in metadata.rights]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            try:
                if metadata.subjects:
                    sourceResource['subject'] = [{
                        "@id": subject.uri,
                        "name": subject.text
                    } if subject.uri is not None else {
                        "name": subject.text
                    } for subject in metadata.subjects]
            except (TypeError, IndexError) as err:
                logger.error('sourceResource.subject: {0}, {1}'.format(
                    err, oai_id))

            # sourceResource.title
            if metadata.titles:
                sourceResource['title'] = ['{}'.format(metadata.titles[0])]
            else:
                logger.error('No sourceResource.title: {0}'.format(oai_id))
                continue

            # sourceResource.type
            sourceResource['type'] = metadata.type_of_resource

            # aggregation.dataProvider / aggregation.intermediateProvider
            # Per-record locals: overwriting the ``iprovide`` argument here
            # would leak the partner's intermediate provider into every
            # later record of the harvest.
            data_provider = dprovide
            intermediate_provider = iprovide
            iid = metadata.iid or ''
            for prefix, partner in partner_prefixes:
                if iid.startswith(prefix):
                    data_provider = partner
                    intermediate_provider = (
                        'Florida State University Libraries')
                    break

            # aggregation.isShownAt
            # An empty PURL must skip the record; appending here regardless
            # would re-add the previous record's document.
            if not purl:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.preview
            pid = metadata.pid
            if pid is None:
                # fall back to the last URN segment with '_' turned into ':'
                pid = oai_id.split(':')[-1].replace('_', ':')
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            # build record
            docs.append(
                assets.build(oai_id, sourceResource, data_provider, purl,
                             preview, intermediate_provider))

    return docs
Пример #10
0
def FlaLD_DC(file_in, tn, dprovide, iprovide=None):
    """Map dPanther OAI Dublin Core records to aggregation documents.

    Args:
        file_in: Path of the harvested OAI file; it is opened as UTF-8 text.
        tn: Thumbnail argument handed unchanged to
            ``assets.thumbnail_service``.
        dprovide: Value used as the data provider of every record.
        iprovide: Optional intermediate provider handed to ``assets.build``.

    Returns:
        list: The ``assets.build`` result for every record that is not
        deleted and has a dPanther identifier, ``dc:rights`` and
        ``dc:title``.
    """
    # Compiled once instead of once per record; patterns are unchanged.
    dPantherPURL = re.compile(
        'http://dpanther.fiu.edu/dpService/dpPurlService')
    dPantherURL = re.compile('http://dpanther')
    # Matches a trailing "( Contributor )" / "( contributor )" role marker.
    contributor_marker = re.compile(r'\(\s*[Cc]ontributor\s*\)\s*$')

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlaLD_DC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if ('deleted' in record.attrib.keys()
                        and record.attrib['deleted'] == 'true'):
                    continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                # find() yields None without a header; the resulting
                # AttributeError simply means "not flagged as deleted".
                header = record.find('./{*}header')
                if ('status' in header.attrib.keys()
                        and header.attrib['status'] == 'deleted'):
                    continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}

            # sourceResource.alternative

            # sourceResource.collection

            # sourceResource.contributor
            if record.metadata.get_element('.//{0}contributor'.format(dc)):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in record.metadata.get_element(
                        './/{0}contributor'.format(dc), delimiter=';')
                ]

            # sourceResource.creator
            if record.metadata.get_element('.//{0}creator'.format(dc)):
                sourceResource['creator'] = []
                for name in record.metadata.get_element(
                        './/{0}creator'.format(dc), delimiter=';'):
                    # covers both ( Contributor ) and ( contributor )
                    if "ontributor )" in name:
                        # Remove the marker as a suffix.  str.rstrip() takes
                        # a *set of characters* and would also strip the end
                        # of the name itself ("John" -> "Joh").
                        sourceResource.setdefault('contributor', []).append({
                            "name":
                            contributor_marker.sub('', name.strip(" "))
                            .strip(" ")
                        })
                    elif name:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = record.metadata.get_element('.//{0}date'.format(dc))
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            if record.metadata.get_element('.//{0}description'.format(dc)):
                sourceResource['description'] = record.metadata.get_element(
                    './/{0}description'.format(dc), delimiter=';')

            # sourceResource.extent

            # sourceResource.format
            dc_format = record.metadata.get_element('.//{0}format'.format(dc))
            if dc_format:
                sourceResource['format'] = dc_format

            # sourceResource.genre

            # sourceResource.identifier
            # Prefer the first dPanther PURL; otherwise fall back to the last
            # plain dPanther URL.  A record with neither is skipped here
            # rather than raising an uncaught KeyError further down.
            identifiers = record.metadata.get_element(
                './/{0}identifier'.format(dc))
            is_shown_at = None
            found_purl = False
            try:
                for ID in identifiers or []:
                    if dPantherPURL.search(ID):
                        is_shown_at = ID
                        found_purl = True
                        break
                    if dPantherURL.search(ID):
                        is_shown_at = ID
            except TypeError as err:
                logger.error('sourceResource.identifier: {0} - {1}'.format(
                    err, oai_id))
                continue
            if is_shown_at is None:
                logger.error('sourceResource.identifier: {0} - {1}'.format(
                    'No dPanther identifier', oai_id))
                continue
            if not found_purl:
                # warn only when the fallback URL is really what gets used
                logger.warning('sourceResource.identifier: {0} - {1}'.format(
                    'Not a PURL', oai_id))
            sourceResource['identifier'] = is_shown_at

            # sourceResource.language
            if record.metadata.get_element('.//{0}language'.format(dc)):
                sourceResource['language'] = []
                for element in record.metadata.get_element(
                        './/{0}language'.format(dc), delimiter=';'):
                    # values longer than three characters are stored as a
                    # name, anything shorter as an iso_639_3 code
                    if len(element) > 3:
                        sourceResource['language'].append({"name": element})
                    else:
                        sourceResource['language'].append(
                            {"iso_639_3": element})

            # sourceResource.place : sourceResource['spatial']
            coverage = record.metadata.get_element(
                './/{0}coverage'.format(dc))
            if coverage:
                sourceResource['spatial'] = [{'name': place}
                                             for place in coverage]

            # sourceResource.publisher
            publisher = record.metadata.get_element(
                './/{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = record.metadata.get_element('.//{0}rights'.format(dc))
            if rights:
                sourceResource['rights'] = [{'text': rights[0]}]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            if record.metadata.get_element('.//{0}subject'.format(dc)):
                sourceResource['subject'] = []
                for term in record.metadata.get_element(
                        './/{0}subject'.format(dc), delimiter=';'):
                    term = re.sub(r"\( lcsh \)$", '', term)
                    if term:
                        sourceResource['subject'].append(
                            {"name": term.strip(" ")})

            # sourceResource.title
            title = record.metadata.get_element('.//{0}title'.format(dc))
            if title:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if record.metadata.get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = record.metadata.get_element(
                    './/{0}type'.format(dc), delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            preview = None
            try:
                # NOTE(review): the whole record is passed here, whereas the
                # sibling mappers pass an identifier/URL - confirm that
                # assets.thumbnail_service accepts a record.
                preview = assets.thumbnail_service(record, tn)
            except (TypeError, UnboundLocalError) as err:
                logger.warning('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record (is_shown_at is guaranteed to be set by now)
            docs.append(
                assets.build(oai_id, sourceResource, data_provider,
                             is_shown_at, preview, iprovide))

    return docs