示例#1
0
def format_book_data(book):
    """Collect the display fields of a book into a ``web.storage``.

    Copies key, url, title, ocaid, eligibility and availability from
    ``book``. Authors and ``work_key`` come from the book's first work when
    it has one, otherwise from the book itself. A cover url is added when a
    cover (or at least an ocaid) is available, and books with an ocaid get
    either ``borrow_url`` or ``read_url`` depending on the collections
    reported by ``ia.get_metadata``.

    :param book: object providing ``get``, ``url``, ``title``, ``works``,
        ``key``, ``get_authors`` and ``get_cover``
    :return: ``web.storage`` holding the collected fields
    """
    d = web.storage()
    d.key = book.get('key')
    d.url = book.url()
    d.title = book.title or None
    d.ocaid = book.get("ocaid")
    d.eligibility = book.get("eligibility", {})
    d.availability = book.get('availability', {})

    def get_authors(doc):
        """Return one ``web.storage(key, name)`` per author of ``doc``."""
        return [web.storage(key=a.key, name=a.name or None) for a in doc.get_authors()]

    work = book.works and book.works[0]
    d.authors = get_authors(work if work else book)
    d.work_key = work.key if work else book.key

    # Prefer the work's cover and fall back to the book's own cover.
    # The work is asked only once rather than once to test and once to use.
    cover = work.get_cover() if work else None
    if not cover:
        cover = book.get_cover()

    if cover:
        d.cover_url = cover.url("M")
    elif d.ocaid:
        d.cover_url = 'https://archive.org/services/img/%s' % d.ocaid

    if d.ocaid:
        collections = ia.get_metadata(d.ocaid).get('collection', [])

        # Both links point at the same "/borrow" url; only the field name
        # tells the caller whether the item is lendable or directly readable.
        if 'lendinglibrary' in collections or 'inlibrary' in collections:
            d.borrow_url = book.url("/borrow")
        else:
            d.read_url = book.url("/borrow")
    return d
示例#2
0
    def get_metadata(self, identifier):
        """Return the metadata of ``identifier``, preferring ``self.ia_cache``.

        :param identifier: archive.org item identifier
        :return: the cached entry when ``identifier`` is in ``self.ia_cache``,
            otherwise the result of ``ia.get_metadata(identifier)``
        """
        logger.info("find_metadata %s", identifier)

        cache = self.ia_cache
        if identifier not in cache:
            return ia.get_metadata(identifier)
        return cache[identifier]
示例#3
0
def get_marc_record_from_ia(identifier):
    """
    Fetch the MARC record of an archive.org item.

    The ``<identifier>_marc.xml`` file is tried first; if it is missing or
    cannot be parsed, ``<identifier>_meta.mrc`` is used instead.
    08/2018: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    :param str identifier: ocaid
    :rtype: MarcXML | MarcBinary
    :return: the MARC record, or None when neither file is usable
    """
    available = ia.get_metadata(identifier)['_filenames']
    download_prefix = '{}{}/'.format(IA_DOWNLOAD_URL, identifier)

    # Preferred source: MARC XML
    xml_name = identifier + '_marc.xml'
    if xml_name in available:
        raw_xml = urlopen_keep_trying(download_prefix + xml_name).read()
        try:
            return MarcXml(etree.fromstring(raw_xml))
        except Exception as e:
            # Best effort: report the problem and fall through to binary MARC
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # Fallback source: binary MARC
    binary_name = identifier + '_meta.mrc'
    if binary_name in available:
        raw_binary = urlopen_keep_trying(download_prefix + binary_name).read()
        return MarcBinary(raw_binary)
示例#4
0
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.
    11/2017: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    The ``<identifier>_marc.xml`` file is tried first; when it is missing or
    unparsable, ``<identifier>_meta.mrc`` is used, provided its declared
    record length matches the downloaded data.

    :param str identifier: archive.org item identifier
    :return: MarcXml or MarcBinary instance, or None when no usable record
        was found
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']

    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"

    item_base = base + "/" + identifier + "/"

    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        try:
            root = etree.fromstring(data)
            return MarcXml(root)
        except Exception as e:
            # Best effort: report the failure, then fall back to binary MARC.
            # print() with a single argument behaves the same on Python 2 and 3.
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        try:
            declared_length = int(data[:5])
        except ValueError:
            # Empty or non-numeric length header: not a usable MARC record
            declared_length = None
        if len(data) == declared_length:
            # This checks the reported data length against the actual data length
            # BinaryMARCs with incorrectly converted unicode characters do not match.
            return MarcBinary(data)
    return None
示例#5
0
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.
    11/2017: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    The ``<identifier>_marc.xml`` file is tried first; when it is missing or
    unparsable, ``<identifier>_meta.mrc`` is used, provided its declared
    record length matches the downloaded data.

    :param str identifier: archive.org item identifier
    :return: MarcXml or MarcBinary instance, or None when no usable record
        was found
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']

    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"

    item_base = base + "/" + identifier + "/"

    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        try:
            root = etree.fromstring(data)
            return MarcXml(root)
        except Exception as e:
            # Best effort: report the failure, then fall back to binary MARC.
            # print() with a single argument behaves the same on Python 2 and 3.
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        try:
            declared_length = int(data[:5])
        except ValueError:
            # Empty or non-numeric length header: not a usable MARC record
            declared_length = None
        if len(data) == declared_length:
            # This checks the reported data length against the actual data length
            # BinaryMARCs with incorrectly converted unicode characters do not match.
            return MarcBinary(data)
    return None
示例#6
0
 def get_metadata(self, identifier):
     """Return the metadata of ``identifier``, logging cache hits and misses.

     :param identifier: archive.org item identifier
     :return: the entry from ``self.ia_cache`` on a hit, otherwise the
         result of ``ia.get_metadata(identifier)``
     """
     cache = self.ia_cache
     if identifier not in cache:
         logger.info("IA metadata cache miss")
         return ia.get_metadata(identifier)

     logger.info("IA metadata cache hit")
     return cache[identifier]
示例#7
0
def get_marc_record_from_ia(identifier):
    """
    Fetch the MARC record of an archive.org item.

    The ``<identifier>_marc.xml`` file is tried first; if it is missing or
    cannot be parsed, ``<identifier>_meta.mrc`` is used instead.
    08/2018: currently called by openlibrary/plugins/importapi/code.py
    when the /api/import/ia endpoint is POSTed to.

    :param str identifier: ocaid
    :rtype: MarcXML | MarcBinary
    :return: the MARC record, or None when neither file is usable
    """
    available = ia.get_metadata(identifier)['_filenames']
    download_prefix = '{}{}/'.format(IA_DOWNLOAD_URL, identifier)

    # Preferred source: MARC XML
    xml_name = identifier + '_marc.xml'
    if xml_name in available:
        raw_xml = urlopen_keep_trying(download_prefix + xml_name).read()
        try:
            return MarcXml(etree.fromstring(raw_xml))
        except Exception as e:
            # Best effort: report the problem and fall through to binary MARC
            print("Unable to read MarcXML: %s" % e)
            traceback.print_exc()

    # Fallback source: binary MARC
    binary_name = identifier + '_meta.mrc'
    if binary_name in available:
        raw_binary = urlopen_keep_trying(download_prefix + binary_name).read()
        return MarcBinary(raw_binary)
示例#8
0
    def get(self, sitename, data):
        """Answer a ``get`` request, synthesising documents for IA items.

        When ``self._get_itemid`` yields no item id for the requested key the
        request is passed on to ``ConnectionMiddleware.get``. Otherwise the
        result is a redirect to the matching edition if one exists, or a JSON
        document built from the item's archive.org metadata.

        :param sitename: name of the site the request is for
        :param data: request data; its ``key`` entry names the document
        :return: redirect, JSON string, or the wrapped connection's response
        :raises client.ClientException: 404 when no document can be built
            from the item's metadata
        """
        itemid = self._get_itemid(data.get('key'))
        if not itemid:
            return ConnectionMiddleware.get(self, sitename, data)

        edition_key = self._find_edition(sitename, itemid)
        if edition_key:
            # An edition already exists, so the item no longer needs to be
            # imported: drop its store entry and redirect to the edition.
            self._ensure_no_store_entry(sitename, itemid)
            return self._make_redirect(itemid, edition_key)

        item_metadata = ia.get_metadata(itemid)
        doc = ia.edition_from_item_metadata(itemid, item_metadata)

        if doc is None:
            # Delete the store entry, if it exists.
            # When an item is darked on archive.org, it should be
            # automatically removed from OL. Removing the entry from the store
            # will trigger the solr-updater to delete it from solr as well.
            self._ensure_no_store_entry(sitename, itemid)

            raise client.ClientException(
                "404 Not Found", "notfound",
                simplejson.dumps({"key": "/books/ia:" + itemid, "error": "notfound"}))

        storedoc = self._ensure_store_entry(sitename, itemid)

        # Hack to add additional subjects to /books/ia: pages.
        # Subjects added to a store doc are added to the book as well;
        # they are used when indexing the books in solr.
        if storedoc.get("subjects"):
            doc.setdefault("subjects", []).extend(storedoc['subjects'])
        return simplejson.dumps(doc)
示例#9
0
    def get(self, sitename, data):
        """Answer a ``get`` request, synthesising documents for IA items.

        When ``self._get_itemid`` yields no item id for the requested key the
        request is passed on to ``ConnectionMiddleware.get``. Otherwise the
        result is a redirect to the matching edition if one exists, or a JSON
        document built from the item's archive.org metadata.

        :param sitename: name of the site the request is for
        :param data: request data; its ``key`` entry names the document
        :return: redirect, JSON string, or the wrapped connection's response
        :raises client.ClientException: 404 when no document can be built
            from the item's metadata
        """
        itemid = self._get_itemid(data.get('key'))
        if not itemid:
            return ConnectionMiddleware.get(self, sitename, data)

        edition_key = self._find_edition(sitename, itemid)
        if edition_key:
            # An edition already exists, so the item no longer needs to be
            # imported: drop its store entry and redirect to the edition.
            self._ensure_no_store_entry(sitename, itemid)
            return self._make_redirect(itemid, edition_key)

        item_metadata = ia.get_metadata(itemid)
        doc = ia.edition_from_item_metadata(itemid, item_metadata)

        if doc is None:
            # Delete the store entry, if it exists.
            # When an item is darked on archive.org, it should be
            # automatically removed from OL. Removing the entry from the store
            # will trigger the solr-updater to delete it from solr as well.
            self._ensure_no_store_entry(sitename, itemid)

            raise client.ClientException(
                "404 Not Found", "notfound",
                simplejson.dumps({"key": "/books/ia:" + itemid, "error": "notfound"}))

        storedoc = self._ensure_store_entry(sitename, itemid)

        # Hack to add additional subjects to /books/ia: pages.
        # Subjects added to a store doc are added to the book as well;
        # they are used when indexing the books in solr.
        if storedoc.get("subjects"):
            doc.setdefault("subjects", []).extend(storedoc['subjects'])
        return simplejson.dumps(doc)
示例#10
0
    def POST(self):
        """Import the archive.org item named by the posted ``identifier``.

        The request is checked step by step (already loaded, valid item,
        pre-matched ``openlibrary`` field, item status, ``no_ol_import``
        flag, MARC record present, not a serial); the first failing check
        ends the request via ``self.error``.

        :return: JSON string when permission is denied, otherwise the result
            of ``self.status_matched``, ``self.error`` or ``self.load_book``
        """
        web.header('Content-Type', 'application/json')

        if not can_write():
            return json.dumps({'success': False, 'error': 'Permission Denied'})

        i = web.input()
        if "identifier" not in i:
            # Return like every other error path, so that execution never
            # falls through to the attribute access below.
            return self.error("bad-input", "identifier not provided")
        identifier = i.identifier

        # Case 0 - Is the item already loaded?
        key = self.find_edition(identifier)
        if key:
            return self.status_matched(key)

        # Case 1 - Is this a valid item?
        metadata = ia.get_metadata(identifier)
        if not metadata:
            return self.error("invalid-ia-identifier")

        # Case 2 - Does the item have the openlibrary field specified?
        # The scan operators search OL before loading the book and add the
        # OL key if a match is found. We can trust them and attach the item
        # to that edition.
        if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
            d = {
                "title": metadata['title'],
                "openlibrary": "/books/" + metadata["openlibrary"]
            }
            d = self.populate_edition_data(d, identifier)
            return self.load_book(d)

        # Case 3 - Can the item be loaded into Open Library?
        status = ia.get_item_status(identifier, metadata)
        if status != 'ok':
            return self.error(status, "Prohibited Item")

        # Gio - April 2016
        # items with metadata no_ol_import=true will not be imported
        if metadata.get("no_ol_import") in ('true', 'True'):
            return self.error("no-ol-import")

        # Case 4 - Does this item have a marc record?
        marc_record = self.get_marc_record(identifier)
        if not marc_record:
            return self.error("no-marc-record")

        # Case 5 - Is the item a serial instead of a book?
        if marc_record.leader()[7] == 's':
            return self.error("item-is-serial")

        edition_data = self.get_edition_data(identifier, marc_record)
        if not edition_data:
            return self.error("invalid-marc-record")
        return self.load_book(edition_data)
示例#11
0
def get_ia_availability(itemid):
    """Classify how an archive.org item can be accessed.

    :param itemid: archive.org item identifier
    :return: ``'borrow'`` for items in the ``lendinglibrary`` or
        ``inlibrary`` collection, ``'restricted'`` for ``printdisabled``
        items, and ``'full'`` for everything else
    """
    item_collections = ia.get_metadata(itemid).get('collection', [])

    # Lending collections take precedence over the print-disabled flag
    if any(name in item_collections for name in ('lendinglibrary', 'inlibrary')):
        return 'borrow'
    if 'printdisabled' in item_collections:
        return 'restricted'
    return 'full'
示例#12
0
    def POST(self):
        """Import the archive.org item named by the posted ``identifier``.

        The request is checked step by step (already loaded, valid item,
        pre-matched ``openlibrary`` field, item status, ``no_ol_import``
        flag, MARC record present, not a serial); the first failing check
        ends the request via ``self.error``.

        :return: JSON string when permission is denied, otherwise the result
            of ``self.status_matched``, ``self.error`` or ``self.load_book``
        """
        web.header('Content-Type', 'application/json')

        if not can_write():
            return json.dumps({'success': False, 'error': 'Permission Denied'})

        i = web.input()
        if "identifier" not in i:
            # Return like every other error path, so that execution never
            # falls through to the attribute access below.
            return self.error("bad-input", "identifier not provided")
        identifier = i.identifier

        # Case 0 - Is the item already loaded?
        key = self.find_edition(identifier)
        if key:
            return self.status_matched(key)

        # Case 1 - Is this a valid item?
        metadata = ia.get_metadata(identifier)
        if not metadata:
            return self.error("invalid-ia-identifier")

        # Case 2 - Does the item have the openlibrary field specified?
        # The scan operators search OL before loading the book and add the
        # OL key if a match is found. We can trust them and attach the item
        # to that edition.
        if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
            d = {
                "title": metadata['title'],
                "openlibrary": "/books/" + metadata["openlibrary"]
            }
            d = self.populate_edition_data(d, identifier)
            return self.load_book(d)

        # Case 3 - Can the item be loaded into Open Library?
        status = ia.get_item_status(identifier, metadata)
        if status != 'ok':
            return self.error(status, "Prohibited Item")

        # Gio - April 2016
        # items with metadata no_ol_import=true will not be imported
        if metadata.get("no_ol_import") in ('true', 'True'):
            return self.error("no-ol-import")

        # Case 4 - Does this item have a marc record?
        marc_record = self.get_marc_record(identifier)
        if not marc_record:
            return self.error("no-marc-record")

        # Case 5 - Is the item a serial instead of a book?
        if marc_record.leader()[7] == 's':
            return self.error("item-is-serial")

        edition_data = self.get_edition_data(identifier, marc_record)
        if not edition_data:
            return self.error("invalid-marc-record")
        return self.load_book(edition_data)
示例#13
0
    def ia_import(cls, identifier, require_marc=True, force_import=False):
        """
        Import an archive.org item into Open Library.

        Fetches the item's metadata, derives edition data from its MARC
        record (or from the item metadata when no MARC record is required),
        and passes the result to ``cls.load_book``.

        :param str identifier: archive.org ocaid
        :param bool require_marc: require archive.org item have MARC record?
        :param bool force_import: force import of this record
        :rtype: dict
        :returns: the data of the imported book
        :raises BookImportError: when the item cannot be imported
        """
        # Step 1 - the identifier must resolve to an Archive.org item
        metadata = ia.get_metadata(identifier)
        if not metadata:
            raise BookImportError('invalid-ia-identifier',
                                  '%s not found' % identifier)

        # Step 2 - items that already carry an openlibrary field.
        # The scan operators search OL before loading the book and add the
        # OL key if a match is found. We can trust them and attach the item
        # to that edition.
        if metadata.get('mediatype') == 'texts' and metadata.get('openlibrary'):
            linked_edition = cls.get_ia_record(metadata)
            linked_edition['openlibrary'] = metadata['openlibrary']
            linked_edition = cls.populate_edition_data(linked_edition, identifier)
            return cls.load_book(linked_edition)

        # Step 3 - the item status must allow loading, unless forced
        status = ia.get_item_status(identifier, metadata)
        if not (status == 'ok' or force_import):
            raise BookImportError(status, 'Prohibited Item %s' % identifier)

        # Step 4 - build the edition from MARC, or from the item metadata
        marc_record = get_marc_record_from_ia(identifier)
        if not marc_record:
            if require_marc:
                raise BookImportError('no-marc-record')
            try:
                edition_data = cls.get_ia_record(metadata)
            except KeyError:
                raise BookImportError('invalid-ia-metadata')
        else:
            if not force_import:
                raise_non_book_marc(marc_record)
            try:
                edition_data = read_edition(marc_record)
            except MarcException as e:
                logger.error('failed to read from MARC record %s: %s',
                             identifier, str(e))
                raise BookImportError('invalid-marc-record')

        # Step 5 - add IA specific fields (ocaid, source_records, cover) and load
        populated = cls.populate_edition_data(edition_data, identifier)
        return cls.load_book(populated)
示例#14
0
    def get_ia_meta_fields(self):
        """Return the archive.org metadata of this object, caching the result.

        :return: the value cached in ``self._ia_meta_fields`` when it is
            truthy; otherwise an empty dict for objects without an ``ocaid``,
            or the result of ``ia.get_metadata(self.ocaid)`` with
            ``external-identifier`` and ``collection`` defaulted to lists
        """
        # Check for cached value
        # $$$ _ia_meta_fields is not assigned before the first call, but
        #     apparently some magic lets us read it this way (and breaks
        #     using hasattr to check whether it is defined)
        cached = self._ia_meta_fields
        if cached:
            return cached

        if self.get('ocaid', None):
            meta = ia.get_metadata(self.ocaid)
            for list_field in ('external-identifier', 'collection'):
                meta.setdefault(list_field, [])
        else:
            meta = {}

        self._ia_meta_fields = meta
        return self._ia_meta_fields
示例#15
0
def test_get_metadata(monkeypatch, mock_memcache):
    """``ia.get_metadata`` returns the item's ``metadata`` section plus
    the ``access-restricted`` and ``_filenames`` entries."""
    api_response = {
        "metadata": {
            "title": "Foo",
            "identifier": "foo00bar",
            "collection": ["printdisabled", "inlibrary"],
        }
    }
    expected = {
        "title": "Foo",
        "identifier": "foo00bar",
        "collection": ["printdisabled", "inlibrary"],
        "access-restricted": False,
        "_filenames": [],
    }

    # Serve the canned response instead of calling the archive.org API
    monkeypatch.setattr(ia, 'get_api_response', lambda *args: api_response)

    actual = ia.get_metadata('foo00bar')
    assert actual == expected
示例#16
0
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.

    The ``<identifier>_marc.xml`` file is used when it is present and starts
    with an XML declaration; otherwise ``<identifier>_meta.mrc`` is used,
    provided its declared record length matches the downloaded data.

    :param str identifier: archive.org item identifier
    :return: MarcXml or MarcBinary instance, or None when no usable record
        was found
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']

    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"

    item_base = base + "/" + identifier + "/"

    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        if data[:10].find('<?xml') != -1:
            root = etree.fromstring(data)
            return MarcXml(root)

    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        try:
            declared_length = int(data[:5])
        except ValueError:
            # Empty or non-numeric length header: not a usable MARC record
            declared_length = None
        # The first five characters declare the record length; a mismatch
        # with the actual length means the record is rejected.
        if len(data) == declared_length:
            return MarcBinary(data)
    return None
示例#17
0
def get_marc_record_from_ia(identifier):
    """Takes IA identifiers and returns MARC record instance.

    The ``<identifier>_marc.xml`` file is used when it is present and starts
    with an XML declaration; otherwise ``<identifier>_meta.mrc`` is used,
    provided its declared record length matches the downloaded data.

    :param str identifier: archive.org item identifier
    :return: MarcXml or MarcBinary instance, or None when no usable record
        was found
    """
    metadata = ia.get_metadata(identifier)
    filenames = metadata['_filenames']

    marc_xml_filename = identifier + "_marc.xml"
    marc_bin_filename = identifier + "_meta.mrc"

    item_base = base + "/" + identifier + "/"

    # Try marc.xml first
    if marc_xml_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_xml_filename).read()
        if data[:10].find('<?xml') != -1:
            root = etree.fromstring(data)
            return MarcXml(root)

    # If that fails, try marc.bin
    if marc_bin_filename in filenames:
        data = urlopen_keep_trying(item_base + marc_bin_filename).read()
        try:
            declared_length = int(data[:5])
        except ValueError:
            # Empty or non-numeric length header: not a usable MARC record
            declared_length = None
        # The first five characters declare the record length; a mismatch
        # with the actual length means the record is rejected.
        if len(data) == declared_length:
            return MarcBinary(data)
    return None
示例#18
0
def parse_data(data):
    """
    Takes POSTed data and determines the format, and returns an Edition record
    suitable for adding to OL.

    :param str data: Raw data
    :rtype: (dict|None, str|None)
    :return: (Edition record, format (rdf|opds|marcxml|json|marc)) or (None, None)
    :raises DataError: when an ``ia:`` item cannot be used, or when binary
        MARC data does not match its declared record length
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if -1 != data[:10].find('<?xml'):
        root = etree.fromstring(data)
        # A MARCXML <collection> wraps its records: unwrap the first one so
        # that it is handled by the <record> branch below. (This check used
        # to sit inside the <record> branch, where it could never be true.)
        if root.tag == marc_collection_tag and len(root):
            root = root[0]

        if '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' == root.tag:
            edition_builder = import_rdf.parse(root)
            data_format = 'rdf'
        elif '{http://www.w3.org/2005/Atom}entry' == root.tag:
            edition_builder = import_opds.parse(root)
            data_format = 'opds'
        elif marc_record_tag == root.tag:
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
            data_format = 'marcxml'
        else:
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(init_dict=obj)
        data_format = 'json'
    else:
        # Special case to load IA records, DEPRECATED: use import/ia endpoint
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            # see ia_importapi to address `imagecount` limitations
            status = ia.get_item_status(itemid, metadata)
            if status != 'ok':
                raise DataError(status)

            try:
                rec = get_marc_record_from_ia(itemid)

                # skip serials
                if rec and rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Marc Binary: the first five characters declare the record length
            try:
                declared_length = int(data[:5])
            except ValueError:
                declared_length = None
            if len(data) != declared_length:
                # Raise like the other failures in this function; callers
                # unpack a 2-tuple, which a JSON string cannot satisfy.
                raise DataError('Bad MARC length')

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(init_dict=edition)
        data_format = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), data_format
示例#19
0
    def _get_ia_item(self, itemid):
        """Build an edition document from an archive.org item's metadata.

        :param itemid: archive.org item identifier
        :return: dict of type ``/type/edition`` keyed ``/books/ia:<itemid>``,
            filled with the title, description, publishers, author names,
            publish date, ISBNs and collection-derived subjects found in the
            metadata
        :raises client.ClientException: 404 when ``self._is_valid_item``
            rejects the item
        """
        timestamp = {"type": "/type/datetime", "value": "2010-01-01T00:00:00"}
        metadata = ia.get_metadata(itemid)

        if not self._is_valid_item(itemid, metadata):
            raise client.ClientException("404 Not Found", "notfound", simplejson.dumps({"key": "/books/ia:" + itemid}))

        d = {
            "key": "/books/ia:" + itemid,
            "type": {"key": "/type/edition"},
            "title": itemid,
            "ocaid": itemid,
            "revision": 1,
            "created": timestamp,
            "last_modified": timestamp
        }

        def add(key, key2=None):
            """Copy metadata[key] to d[key2] as a single value."""
            key2 = key2 or key
            # sometimes the empty values are represented as {} in metadata API. Avoid them.
            if key in metadata and metadata[key] != {}:
                value = metadata[key]
                if isinstance(value, list):
                    value = [v for v in value if v != {}]
                    if value:
                        # Several strings are joined into one text;
                        # otherwise only the first entry is kept.
                        if isinstance(value[0], basestring):
                            value = "\n\n".join(value)
                        else:
                            value = value[0]
                    else:
                        # empty list. Ignore.
                        return

                d[key2] = value

        def add_list(key, key2):
            """Copy metadata[key] to d[key2], always as a list."""
            key2 = key2 or key
            # sometimes the empty values are represented as {} in metadata API. Avoid them.
            if key in metadata and metadata[key] != {}:
                value = metadata[key]
                if not isinstance(value, list):
                    value = [value]
                d[key2] = value

        def add_isbns():
            """Sort the item's ISBNs into d['isbn_10'] and d['isbn_13']."""
            isbns = metadata.get('isbn')
            # A lone ISBN given as a bare string must be wrapped, as add_list
            # does for scalars; iterating it would yield single characters
            # and silently drop the ISBN.
            if isinstance(isbns, basestring):
                isbns = [isbns]
            isbn_10 = []
            isbn_13 = []
            if isbns:
                for isbn in isbns:
                    isbn = isbn.replace("-", "").strip()
                    if len(isbn) == 13:
                        isbn_13.append(isbn)
                    elif len(isbn) == 10:
                        isbn_10.append(isbn)
            if isbn_10:
                d["isbn_10"] = isbn_10
            if isbn_13:
                d["isbn_13"] = isbn_13

        def add_subjects():
            """Turn lending-related collections into subjects."""
            collections = metadata.get("collection", [])
            mapping = {
                "inlibrary": "In library",
                "lendinglibrary": "Lending library"
            }
            subjects = [subject for c, subject in mapping.items() if c in collections]
            if subjects:
                d['subjects'] = subjects

        add('title')
        add('description', 'description')
        add_list('publisher', 'publishers')
        add_list("creator", "author_names")
        add('date', 'publish_date')

        add_isbns()
        add_subjects()

        return d
示例#20
0
    def POST(self):
        """Import one archive.org item, or one record of a bulk MARC file.

        Reads ``identifier`` (required) and the optional ``require_marc``,
        ``bulk_marc`` and ``local_id`` inputs from the request.

        With ``bulk_marc=true`` the identifier has the form
        ``ocaid/filename:offset:length``; the MARC record found there is
        loaded and the JSON response also carries ``next_record_offset`` and
        ``next_record_length``. Otherwise the identifier names an archive.org
        item whose MARC record (or, with ``require_marc=false``, its item
        metadata) is turned into edition data and loaded.

        :return: JSON string for bulk imports, otherwise the result of
            ``self.error`` or ``self.load_book``
        :raises web.HTTPError: 403 when the user may not write
        """
        web.header('Content-Type', 'application/json')

        if not can_write():
            raise web.HTTPError('403 Forbidden')

        i = web.input()

        require_marc = not (i.get('require_marc') == 'false')
        bulk_marc = i.get('bulk_marc') == 'true'

        if 'identifier' not in i:
            return self.error('bad-input', 'identifier not provided')
        identifier = i.identifier

        # First check whether this is a non-book, bulk-marc item
        if bulk_marc:
            # Get binary MARC by identifier = ocaid/filename:offset:length
            re_bulk_identifier = re.compile(r"([^/]*)/([^:]*):(\d*):(\d*)")
            bulk_match = re_bulk_identifier.match(identifier)
            if not bulk_match:
                # Report a malformed identifier instead of failing on None
                return self.error(
                    'bad-input',
                    'invalid bulk MARC identifier %s' % identifier)
            ocaid, filename, offset, _length = bulk_match.groups()

            # Defined before the try so that the error handler can always
            # use it, even if the failure happens before it is filled in.
            next_data = {}
            try:
                data, next_offset, next_length = get_from_archive_bulk(
                    identifier)
                next_data = {
                    'next_record_offset': next_offset,
                    'next_record_length': next_length
                }
                rec = MarcBinary(data)
                edition = read_edition(rec)
            except MarcException as e:
                details = "%s: %s" % (identifier, str(e))
                logger.error("failed to read from bulk MARC record %s",
                             details)
                return self.error('invalid-marc-record', details, **next_data)

            # Record the length declared by the MARC leader, not the
            # length given in the request.
            actual_length = int(rec.leader()[:MARC_LENGTH_POS])
            edition['source_records'] = 'marc:%s/%s:%s:%d' % (
                ocaid, filename, offset, actual_length)

            local_id = i.get('local_id')
            if local_id:
                local_id_type = web.ctx.site.get('/local_ids/' + local_id)
                prefix = local_id_type.urn_prefix
                # id_location has the form "<field>$<subfield>"
                id_field, id_subfield = local_id_type.id_location.split('$')

                def get_subfield(field, id_subfield):
                    """Return the field itself if it is a string, else its
                    first value for ``id_subfield`` (or None)."""
                    if isinstance(field, str):
                        return field
                    subfields = field[1].get_subfield_values(id_subfield)
                    return subfields[0] if subfields else None

                _ids = [
                    get_subfield(f, id_subfield)
                    for f in rec.read_fields([id_field])
                    if f and get_subfield(f, id_subfield)
                ]
                edition['local_id'] = [
                    'urn:%s:%s' % (prefix, _id) for _id in _ids
                ]

            # Don't add the book if the MARC record is a non-book item
            self.reject_non_book_marc(rec, **next_data)
            result = add_book.load(edition)

            # Add next_data to the response as location of next record:
            result.update(next_data)
            return json.dumps(result)

        # Case 1 - Is this a valid Archive.org item?
        metadata = ia.get_metadata(identifier)
        if not metadata:
            return self.error('invalid-ia-identifier',
                              '%s not found' % identifier)

        # Case 2 - Does the item have an openlibrary field specified?
        # The scan operators search OL before loading the book and add the
        # OL key if a match is found. We can trust them and attach the item
        # to that edition.
        if metadata.get('mediatype') == 'texts' and metadata.get(
                'openlibrary'):
            edition_data = self.get_ia_record(metadata)
            edition_data['openlibrary'] = metadata['openlibrary']
            edition_data = self.populate_edition_data(edition_data, identifier)
            return self.load_book(edition_data)

        # Case 3 - Can the item be loaded into Open Library?
        status = ia.get_item_status(identifier, metadata)
        if status != 'ok':
            return self.error(status, 'Prohibited Item %s' % identifier)

        # Case 4 - Does this item have a marc record?
        marc_record = get_marc_record_from_ia(identifier)
        if marc_record:
            self.reject_non_book_marc(marc_record)
            try:
                edition_data = read_edition(marc_record)
            except MarcException as e:
                logger.error('failed to read from MARC record %s: %s',
                             identifier, str(e))
                return self.error('invalid-marc-record')
        elif require_marc:
            return self.error('no-marc-record')
        else:
            try:
                edition_data = self.get_ia_record(metadata)
            except KeyError:
                return self.error("invalid-ia-metadata")

        # Add IA specific fields: ocaid, source_records, and cover
        edition_data = self.populate_edition_data(edition_data, identifier)
        return self.load_book(edition_data)
示例#21
0
def parse_data(data):
    """
    Parse raw import data into an edition dict and the name of its format.

    The format is detected from the content itself:

    * text starting with an ``<?xml`` declaration: RDF, an Atom (OPDS) entry,
      or MARC XML (a single ``record`` or a ``collection`` of records),
    * a JSON object (``{...}``),
    * an ``ia:<itemid>`` reference, whose MARC record is fetched from IA,
    * anything else is treated as binary MARC.

    :param str data: raw import payload
    :return: ``(edition_dict, format)`` where format is one of ``'rdf'``,
        ``'opds'``, ``'marcxml'``, ``'json'`` or ``'marc'``;
        ``(None, None)`` for an unrecognized XML document
    :raises DataError: for an unknown or non-book IA item, a missing MARC
        record, a serial, or binary MARC whose declared length is wrong
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_record_tag, marc_collection_tag):
            # The collection check used to be nested under the "record" tag
            # test and could never match; accept both root elements here.
            if root.tag == marc_collection_tag:
                # Only the first record of a collection is imported.
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            # Parenthesized single-argument form prints the same text on
            # Python 2 and Python 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)

                # Check for a missing record *before* touching it, otherwise
                # a falsy result fails on .leader() instead of reporting
                # the real problem.
                if not rec:
                    raise DataError("no-marc-record")

                # skip serials
                if rec.leader()[7] == 's':
                    raise DataError("item-is-serial")
            except IOError:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Binary MARC: the first five characters are compared against
            # the actual payload length. Report a mismatch (or a
            # non-numeric prefix) the same way every other failure in this
            # function is reported, instead of returning a JSON string that
            # callers cannot unpack into (edition, format).
            try:
                declared_length = int(data[:5])
            except ValueError:
                raise DataError('Bad MARC length')
            if len(data) != declared_length:
                raise DataError('Bad MARC length')

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), fmt
示例#22
0
    def process(self, req):
        """
        Resolve a combined bib-key request into one record per sub-request.

        ``req`` is a ``|``-separated list of requests; each request is a
        ``;``-separated list of bib keys, optionally starting with an
        ``id:<name>`` entry that names the request's slot in the result.

        Besides returning the result, this stores the intermediate lookups
        on the instance: ``docs``, ``detailss``, ``datas``, ``works``,
        ``wkey_to_iaids``, ``iaid_to_meta`` and ``iaid_to_ed``.

        :param str req: the raw request string
        :return: dict mapping each result key to the value produced by
            ``self.make_record`` (requests with a falsy record are omitted);
            with the ``debug_items`` option, lookup internals are added too
        """
        requests = req.split('|')

        # 'id:foo' entries only label a request, so they are dropped before
        # the keys are handed to dynlinks.
        bib_keys = [
            key
            for request in requests
            for key in request.split(';')
            if key[:3].lower() != 'id:'
        ]

        self.docs = dynlinks.query_docs(bib_keys)
        if self.options.get('no_details'):
            self.detailss = {}
        else:
            self.detailss = dynlinks.process_result_for_details(self.docs)
        processor = dynlinks.DataProcessor()
        self.datas = processor.process(self.docs)
        self.works = processor.works

        # XXX costs are capped per work with [:iaid_limit] - note that this
        # may result in no 'exact' item match, even if one exists.
        # Note that it's available thru above works/docs
        iaid_limit = 500
        wkey_to_iaids = {}
        for wkey in self.works:
            wkey_to_iaids[wkey] = get_work_iaids(wkey)[:iaid_limit]
        self.wkey_to_iaids = wkey_to_iaids

        iaids = [
            iaid
            for work_iaids in self.wkey_to_iaids.values()
            for iaid in work_iaids
        ]

        iaid_to_meta = {}
        for iaid in iaids:
            iaid_to_meta[iaid] = ia.get_metadata(iaid)
        self.iaid_to_meta = iaid_to_meta

        def lookup_iaids(iaids):
            # Query edition keys by ocaid, at most `step` identifiers per
            # query unless the 'debug_things' option disables batching.
            step = 10
            if len(iaids) <= step or self.options.get('debug_things'):
                return web.ctx.site.things({
                    'type': '/type/edition',
                    'ocaid': iaids,
                })
            found = []
            for start in range(0, len(iaids), step):
                found += lookup_iaids(iaids[start:start + step])
            return found

        ekeys = lookup_iaids(iaids)

        # If returned order were reliable, I could skip the below.
        eds = dynlinks.ol_get_many_as_dict(ekeys)
        iaid_to_ed = {}
        for ed in eds.values():
            iaid_to_ed[ed['ocaid']] = ed
        self.iaid_to_ed = iaid_to_ed

        # Possible future work: derive per-iaid loanability by picking the
        # iaids whose metadata collections include 'lendinglibrary' or
        # 'inlibrary', mapping them to edition keys, and reading their loan
        # status from site.store (unclear whether it supports get_many).

        result = {}
        for request in requests:
            request_keys = request.split(';')
            if request.lower().startswith('id:'):
                # The leading 'id:<name>' entry supplies the result key.
                result_key = request_keys.pop(0)[3:]
            else:
                result_key = request
            record = self.make_record(request_keys)
            if record:
                result[result_key] = record

        if self.options.get('debug_items'):
            result['ekeys'] = ekeys
            result['eds'] = eds
            result['iaids'] = iaids

        return result
示例#23
0
def test_get_metadata_empty(monkeypatch, mock_memcache):
    """get_metadata returns {} when the patched API response is empty."""
    def empty_api_response(*args):
        return {}

    monkeypatch.setattr(ia, 'get_api_response', empty_api_response)
    metadata = ia.get_metadata('foo02bar')
    assert metadata == {}
示例#24
0
 def get_metadata(self, identifier):
     """Log the lookup, then return ``ia.get_metadata`` for *identifier*."""
     logger.info("find_metadata %s", identifier)
     metadata = ia.get_metadata(identifier)
     return metadata
示例#25
0
 def get_metadata(self, identifier):
     """Log the lookup, then return ``ia.get_metadata`` for *identifier*."""
     logger.info("find_metadata %s", identifier)
     metadata = ia.get_metadata(identifier)
     return metadata
示例#26
0
 def get_metadata(self, identifier: str):
     """Return the result of ``ia.get_metadata`` for *identifier*."""
     metadata = ia.get_metadata(identifier)
     return metadata
示例#27
0
    def _get_ia_item(self, itemid):
        """
        Build an edition-like document from an archive.org item's metadata.

        The document is keyed ``/books/ia:<itemid>``, typed ``/type/edition``
        and starts out with the item id as its title; title, description,
        publishers, author names, publish date, ISBNs and lending subjects
        are then filled in from the item metadata where present.

        :param str itemid: archive.org item identifier
        :return: the document as a dict, or None when
            ``self._is_valid_item`` rejects the item
        """
        # Fixed timestamp used for both "created" and "last_modified".
        timestamp = {"type": "/type/datetime", "value": "2010-01-01T00:00:00"}
        metadata = ia.get_metadata(itemid)

        if not self._is_valid_item(itemid, metadata):
            return None

        d = {
            "key": "/books/ia:" + itemid,
            "type": {
                "key": "/type/edition"
            },
            "title": itemid,
            "ocaid": itemid,
            "revision": 1,
            "created": timestamp,
            "last_modified": timestamp
        }

        def add(key, key2=None):
            """Copy metadata[key] into d[key2] as a single value."""
            key2 = key2 or key
            # sometimes the empty values are represented as {} in metadata API. Avoid them.
            if key in metadata and metadata[key] != {}:
                value = metadata[key]
                if isinstance(value, list):
                    value = [v for v in value if v != {}]
                    if value:
                        if isinstance(value[0], basestring):
                            # Several strings become one blank-line separated text.
                            value = "\n\n".join(value)
                        else:
                            value = value[0]
                    else:
                        # empty list. Ignore.
                        return

                d[key2] = value

        def add_list(key, key2=None):
            """Copy metadata[key] into d[key2], always as a list."""
            key2 = key2 or key
            # sometimes the empty values are represented as {} in metadata API. Avoid them.
            if key in metadata and metadata[key] != {}:
                value = metadata[key]
                if not isinstance(value, list):
                    value = [value]
                d[key2] = value

        def add_isbns():
            """Sort the item's ISBNs into d['isbn_10'] / d['isbn_13'] by length."""
            isbns = metadata.get('isbn')
            if not isbns:
                return
            # A single value may arrive as a bare string rather than a list
            # (add_list allows for the same thing); iterating the string
            # directly would walk its characters and drop the ISBN.
            if not isinstance(isbns, list):
                isbns = [isbns]

            isbn_10 = []
            isbn_13 = []
            for isbn in isbns:
                if isbn == {}:
                    # {} stands for an empty value; nothing to record.
                    continue
                isbn = isbn.replace("-", "").strip()
                if len(isbn) == 13:
                    isbn_13.append(isbn)
                elif len(isbn) == 10:
                    isbn_10.append(isbn)
            if isbn_10:
                d["isbn_10"] = isbn_10
            if isbn_13:
                d["isbn_13"] = isbn_13

        def add_subjects():
            """Translate lending collections into human-readable subjects."""
            collections = metadata.get("collection", [])
            mapping = {
                "inlibrary": "In library",
                "lendinglibrary": "Lending library"
            }
            subjects = [
                subject for c, subject in mapping.items() if c in collections
            ]
            if subjects:
                d['subjects'] = subjects

        add('title')
        add('description', 'description')
        add_list('publisher', 'publishers')
        add_list("creator", "author_names")
        add('date', 'publish_date')

        add_isbns()
        add_subjects()

        return d
示例#28
0
def parse_data(data):
    """
    Parse raw import data into an edition dict and the name of its format.

    The format is detected from the content itself:

    * text starting with an ``<?xml`` declaration: RDF, an Atom (OPDS) entry,
      or MARC XML (a single ``record`` or a ``collection`` of records),
    * a JSON object (``{...}``),
    * an ``ia:<itemid>`` reference, whose MARC record is fetched from IA,
    * anything else is treated as binary MARC.

    :param str data: raw import payload
    :return: ``(edition_dict, format)`` where format is one of ``'rdf'``,
        ``'opds'``, ``'marcxml'``, ``'json'`` or ``'marc'``;
        ``(None, None)`` for an unrecognized XML document
    :raises DataError: for an unknown or non-book IA item, a missing MARC
        record, or binary MARC whose declared length is wrong
    """
    marc_record_tag = '{http://www.loc.gov/MARC21/slim}record'
    marc_collection_tag = '{http://www.loc.gov/MARC21/slim}collection'

    data = data.strip()
    if '<?xml' in data[:10]:
        root = etree.fromstring(data)
        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            edition_builder = import_rdf.parse(root)
            fmt = 'rdf'
        elif root.tag == '{http://www.w3.org/2005/Atom}entry':
            edition_builder = import_opds.parse(root)
            fmt = 'opds'
        elif root.tag in (marc_record_tag, marc_collection_tag):
            # The collection check used to be nested under the "record" tag
            # test and could never match; accept both root elements here.
            if root.tag == marc_collection_tag:
                # Only the first record of a collection is imported.
                root = root[0]
            rec = MarcXml(root)
            edition = read_edition(rec)
            edition_builder = import_edition_builder.import_edition_builder(
                init_dict=edition)
            fmt = 'marcxml'
        else:
            # Parenthesized single-argument form prints the same text on
            # Python 2 and Python 3.
            print('unrecognized XML format')
            return None, None
    elif data.startswith('{') and data.endswith('}'):
        obj = json.loads(data)
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=obj)
        fmt = 'json'
    else:
        # Special case to load IA records
        # Just passing ia:foo00bar is enough to load foo00bar from IA.
        if data.startswith("ia:"):
            source_records = [data]
            itemid = data[len("ia:"):]

            metadata = ia.get_metadata(itemid)
            if not metadata:
                raise DataError("invalid-ia-identifier")

            if not ia.edition_from_item_metadata(itemid, metadata):
                raise DataError("item-not-a-book")

            try:
                rec = get_marc_record_from_ia(itemid)
            except IOError:
                raise DataError("no-marc-record")

            if not rec:
                raise DataError("no-marc-record")
        else:
            source_records = None
            itemid = None

            # Binary MARC: the first five characters are compared against
            # the actual payload length. Report a mismatch (or a
            # non-numeric prefix) the same way every other failure in this
            # function is reported, instead of returning a JSON string that
            # callers cannot unpack into (edition, format).
            try:
                declared_length = int(data[:5])
            except ValueError:
                raise DataError('Bad MARC length')
            if len(data) != declared_length:
                raise DataError('Bad MARC length')

            rec = MarcBinary(data)

        edition = read_edition(rec)
        if source_records:
            edition['source_records'] = source_records
            edition['ocaid'] = itemid
        edition_builder = import_edition_builder.import_edition_builder(
            init_dict=edition)
        fmt = 'marc'

    parse_meta_headers(edition_builder)

    return edition_builder.get_dict(), fmt
示例#29
0
def test_get_metadata_empty(monkeypatch, mock_memcache):
    """get_metadata returns {} when the patched _get_metadata yields {}."""
    def empty_metadata(_id):
        return {}

    monkeypatch.setattr(ia, '_get_metadata', empty_metadata)
    metadata = ia.get_metadata('foo02bar')
    assert metadata == {}