def load_batch(self, batch_path):
    """Load a batch and return a Batch instance for the batch that
    was loaded.

        loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')
    """
    self.pages_processed = 0

    # a trailing slash breaks the comparison to link_name below, so strip it
    batch_path = batch_path.rstrip("/")
    _logger.info("loading batch at %s", batch_path)
    dirname, batch_name = os.path.split(batch_path)

    if dirname:
        batch_source = None
        link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
        # create a symlink if the paths don't match, the symlink isn't
        # already there, and batch_path wasn't given via a BATCH_STORAGE
        # symlink path
        if (batch_path != link_name
                and not os.path.islink(link_name)
                and not (os.path.islink(settings.BATCH_STORAGE)
                         and batch_path.startswith(
                             os.path.realpath(settings.BATCH_STORAGE)))):
            _logger.info("creating symlink %s -> %s", batch_path, link_name)
            os.symlink(batch_path, link_name)
    else:
        batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                            batch_name)
        if not batch_source.endswith("/"):
            batch_source += "/"

    batch_name = _normalize_batch_name(batch_name)
    try:
        batch = Batch.objects.get(name=batch_name)
        _logger.info("Batch already loaded: %s", batch_name)
        return batch
    except Batch.DoesNotExist:
        pass

    _logger.info("loading batch: %s", batch_name)
    t0 = time()
    times = []

    event = LoadBatchEvent(batch_name=batch_name, message="starting load")
    event.save()

    batch = None
    try:
        # build a Batch object for the batch location
        batch = self._get_batch(batch_name, batch_source, create=True)
        self._sanity_check_batch(batch)

        # stash it away for processing later on
        self.current_batch = batch

        # parse the batch.xml and load up each issue mets file
        doc = etree.parse(batch.validated_batch_url)

        # make sure a Reel record exists for each reel in the batch
        for e in doc.xpath('ndnp:reel', namespaces=ns):
            reel_number = e.attrib['reelNumber'].strip()
            try:
                models.Reel.objects.get(number=reel_number, batch=batch)
            except models.Reel.DoesNotExist:
                reel = models.Reel(number=reel_number, batch=batch)
                reel.save()

        for e in doc.xpath('ndnp:issue', namespaces=ns):
            mets_url = urllib.parse.urljoin(batch.storage_url, e.text)
            try:
                issue = self._load_issue(mets_url)
            except ValueError as e:
                _logger.exception(e)
                continue
            reset_queries()
            times.append((time() - t0, self.pages_processed))

        # commit new changes to the solr index, if we are indexing
        if self.PROCESS_OCR:
            self.solr.commit()

        batch.save()
        msg = "processed %s pages" % batch.page_count
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        _logger.info(msg)
        event.save()
    except Exception as e:
        msg = "unable to load batch: %s" % e
        # exception() logs at ERROR level and includes the traceback
        _logger.exception(msg)
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        event.save()
        try:
            self.purge_batch(batch_name)
        except Exception as pbe:
            _logger.error("purge batch failed for failed load batch: %s", pbe)
            _logger.exception(pbe)
        raise BatchLoaderException(msg)

    # update the min and max years of all titles
    set_fulltext_range()

    return batch
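# A minimal usage sketch for load_batch, illustrative only: the import path
# and the no-argument BatchLoader() constructor are assumptions about how
# the surrounding project is laid out, not part of this module.
#
#     from core.batch_loader import BatchLoader, BatchLoaderException
#
#     loader = BatchLoader()
#     try:
#         batch = loader.load_batch('/opt/batches/batch_curiv_ahwahnee_ver01')
#         print("loaded %s (%s pages)" % (batch.name, batch.page_count))
#     except BatchLoaderException as e:
#         # load_batch purges the partially loaded batch before re-raising
#         print("load failed: %s" % e)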
def _load_page(self, doc, div, issue):
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath('string(.//mods:extent/mods:start)',
                            namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError:
        raise BatchLoaderException(
            "could not determine sequence number for page from '%s'"
            % seq_string)

    page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                             namespaces=ns).strip()

    reel_number = mods.xpath(
        'string(.//mods:identifier[@type="reel number"])',
        namespaces=ns).strip()
    try:
        reel = models.Reel.objects.get(number=reel_number,
                                       batch=self.current_batch)
        page.reel = reel
    except models.Reel.DoesNotExist:
        if reel_number:
            reel = models.Reel(number=reel_number,
                               batch=self.current_batch,
                               implicit=True)
            reel.save()
            page.reel = reel
        else:
            _logger.warning("unable to find reel number in page metadata")

    _logger.info("Assigned page sequence: %s", page.sequence)

    _section_dmdid = div.xpath(
        'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
        namespaces=ns)
    if _section_dmdid:
        section_mods = dmd_mods(doc, _section_dmdid)
        section_label = section_mods.xpath(
            'string(.//mods:detail[@type="section label"]/mods:number[1])',
            namespaces=ns).strip()
        if section_label:
            page.section_label = section_label

    page.issue = issue

    _logger.info("Saving page. issue date: %s, page sequence: %s",
                 issue.date_issued, page.sequence)

    # TODO: consider executing the file name assignments (below)
    # before this page.save()
    page.save()

    notes = []
    for mods_note in mods.xpath('.//mods:note', namespaces=ns):
        # note_type avoids shadowing the builtin type()
        note_type = mods_note.xpath('string(./@type)')
        label = mods_note.xpath('string(./@displayLabel)')
        text = mods_note.xpath('string(.)').strip()
        note = models.PageNote(type=note_type, label=label, text=text)
        notes.append(note)
    page.notes.set(notes, bulk=False)

    # there's a level of indirection between the METS structmap and the
    # details about specific files in this package ... so we have to
    # first get the FILEID from the page div in the structmap and then
    # use it to look up the file details in the larger document
    for fptr in div.xpath('./mets:fptr', namespaces=ns):
        file_id = fptr.attrib['FILEID']
        file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                            namespaces=ns)[0]
        file_type = file_el.attrib['USE']

        # get the filename relative to the storage location
        file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                  namespaces=ns)
        file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name)
        file_name = self.storage_relative_path(file_name)

        if file_type == 'master':
            page.tiff_filename = file_name
        elif file_type == 'service':
            page.jp2_filename = file_name
            try:
                # extract the image dimensions from the technical
                # metadata for the jp2
                for admid in file_el.attrib['ADMID'].split(' '):
                    length, width = get_dimensions(doc, admid)
                    if length and width:
                        page.jp2_width = width
                        page.jp2_length = length
                        break
            except KeyError:
                _logger.info(
                    "Could not determine dimensions of jp2 for issue: %s "
                    "page: %s ... trying harder ...", page.issue, page)
                # fall back to reading the dimensions from the image itself
                im = Image.open(page.jp2_abs_filename)
                page.jp2_width, page.jp2_length = im.size
            if not page.jp2_width:
                raise BatchLoaderException(
                    "No jp2 width for issue: %s page: %s"
                    % (page.issue, page))
            if not page.jp2_length:
                raise BatchLoaderException(
                    "No jp2 length for issue: %s page: %s"
                    % (page.issue, page))
        elif file_type == 'derivative':
            page.pdf_filename = file_name
        elif file_type == 'ocr':
            page.ocr_filename = file_name

    if page.ocr_filename:
        # don't incur the overhead of extracting ocr text, word
        # coordinates and indexing unless the batch loader has been
        # set up to do it
        if self.PROCESS_OCR:
            self.process_ocr(page)
    else:
        _logger.info("No ocr filename for issue: %s page: %s",
                     page.issue, page)

    _logger.debug("saving page: %s", page.url)
    page.save()
    return page
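# The structmap -> fileSec indirection handled above can be seen on a toy
# METS document. This is an illustrative sketch: the element names follow
# the METS schema, but the tiny document below is invented for the example.
#
#     from lxml import etree
#
#     METS_NS = {'mets': 'http://www.loc.gov/METS/',
#                'xlink': 'http://www.w3.org/1999/xlink'}
#
#     toy = etree.fromstring(b'''
#     <mets xmlns="http://www.loc.gov/METS/"
#           xmlns:xlink="http://www.w3.org/1999/xlink">
#       <fileSec>
#         <fileGrp>
#           <file ID="serviceFile0001" USE="service">
#             <FLocat xlink:href="0001.jp2"/>
#           </file>
#         </fileGrp>
#       </fileSec>
#       <structMap>
#         <div TYPE="np:page">
#           <fptr FILEID="serviceFile0001"/>
#         </div>
#       </structMap>
#     </mets>''')
#
#     # step 1: the page div in the structmap only carries a FILEID pointer
#     fptr = toy.xpath('//mets:structMap//mets:fptr', namespaces=METS_NS)[0]
#     file_id = fptr.attrib['FILEID']
#
#     # step 2: resolve it against the fileSec to get the USE and the href
#     file_el = toy.xpath('//mets:file[@ID="%s"]' % file_id,
#                         namespaces=METS_NS)[0]
#     print(file_el.attrib['USE'])                   # -> service
#     print(file_el.xpath('string(./mets:FLocat/@xlink:href)',
#                         namespaces=METS_NS))       # -> 0001.jp2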