def purge_batch(self, batch_name): event = LoadBatchEvent(batch_name=batch_name, message="starting purge") event.save() try: batch = self._get_batch(batch_name) self._purge_batch(batch) event = LoadBatchEvent(batch_name=batch_name, message="purged") event.save() # clean up symlinks if exists link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if os.path.islink(link_name): LOGGER.info("Removing symlink %s", link_name) os.remove(link_name) except Exception as e: msg = "purge failed: %s" % e LOGGER.exception(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() raise BatchLoaderException(msg)
                batch_source += "/"
        # NOTE(review): this chunk begins mid-way through load_batch (the
        # statement above is the tail of the batch_source normalization) and
        # is cut off below at an unterminated for-loop header.
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            # tolerant mode: if the batch was already loaded, return it
            # instead of failing
            try:
                batch = Batch.objects.get(name=batch_name)
                _logger.info("Batch already loaded: %s" % batch_name)
                return batch
            except Batch.DoesNotExist, e:
                pass
        _logger.info("loading batch: %s" % batch_name)
        # t0/times record elapsed time per pages processed (profiling aid)
        t0 = time()
        times = []
        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()
        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)
            # stash it away for processing later on
            self.current_batch = batch
            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)
            # NOTE(review): truncated here -- the reel-loop body continues in
            # a later chunk of this file.
            for e in doc.xpath('ndnp:reel', namespaces=ns):
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """

    def __init__(self, process_ocr=True, process_coordinates=True, in_copyright=False):
        """Create a BatchLoader.

        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        # Flags are stored upper-case on the instance; the solr connection
        # is only opened when OCR processing/indexing is enabled.
        self.PROCESS_OCR = process_ocr
        if self.PROCESS_OCR:
            self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates
        self.IN_COPYRIGHT = in_copyright

    def _find_batch_file(self, batch):
        """Return the filename of the batch XML inside the batch's storage
        location, trying the known aliases in order.

        Raises BatchLoaderException when none of the aliases exists.
        """
        # look for batch_1.xml, BATCH_1.xml, etc
        # NOTE(review): storage_url is joined with os.path and tested with
        # os.path.isfile, so despite the name it appears to be a local
        # filesystem path here -- confirm against Batch.storage_url.
        for alias in ["batch_1.xml", "BATCH_1.xml", "batchfile_1.xml",
                      "batch_2.xml", "BATCH_2.xml", "batch.xml"]:
            if os.path.isfile(os.path.join(batch.storage_url, alias)):
                return alias
            else:
                continue
        else:
            # for/else: reached only when no alias matched
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path)

    def _sanity_check_batch(self, batch):
        # Verify the batch exists on disk and remember which batch XML
        # alias was found, for parsing later in the load.
        if not os.path.exists(batch.path):
            raise BatchLoaderException("batch does not exist at %s" % batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch that was loaded. 
        loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')
        """
        # NOTE(review): this chunk continues the load_batch definition whose
        # docstring opened in the previous chunk, and is cut off mid-try at
        # the bottom.
        self.pages_processed = 0
        logging.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            # caller gave a filesystem path: expose it under BATCH_STORAGE
            # via a symlink so the batch is reachable from storage
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                _logger.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            # bare batch name: resolve it inside BATCH_STORAGE, with a
            # trailing slash so later URL joins treat it as a directory
            batch_source = os.path.join(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            # tolerant mode: if the batch was already loaded, return it
            # instead of failing
            try:
                batch = Batch.objects.get(name=batch_name)
                _logger.info("Batch already loaded: %s" % batch_name)
                return batch
            except Batch.DoesNotExist, e:
                pass
        _logger.info("loading batch: %s" % batch_name)
        # t0/times record elapsed time per pages processed (profiling aid)
        t0 = time()
        times = []
        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()
        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)
            # stash it away for processing later on
            self.current_batch = batch
            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)
            for e in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = e.attrib['reelNumber'].strip()
                # get-or-create the Reel row for this reel number
                try:
                    reel = models.Reel.objects.get(number=reel_number, batch=batch)
                except models.Reel.DoesNotExist, e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()
            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                # a bad issue METS file is logged and skipped, not fatal
                try:
                    issue = self._load_issue(mets_url)
                except ValueError, e:
                    _logger.exception(e)
                    continue
                # clear Django's query log to keep memory bounded in DEBUG
                reset_queries()
                # NOTE(review): chunk truncated here, mid-try
                times.append((time() - t0, self.pages_processed))
def purge_batch(self, batch_name): event = LoadBatchEvent(batch_name=batch_name, message="starting purge") event.save() try: batch = self._get_batch(batch_name) self._purge_batch(batch) event = LoadBatchEvent(batch_name=batch_name, message="purged") event.save() # clean up symlinks if exists link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if os.path.islink(link_name): _logger.info("Removing symlink %s", link_name) os.remove(link_name) except Exception, e: msg = "purge failed: %s" % e _logger.error(msg) _logger.exception(e) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() raise BatchLoaderException(msg)
            mets_url = urlparse.urljoin(batch.storage_url, e.text)
            # NOTE(review): this chunk resumes load_batch inside the issue
            # loop and ends before the final raise/return of the method.
            # a bad issue METS file is logged and skipped, not fatal
            try:
                issue = self._load_issue(mets_url)
            except ValueError, e:
                _logger.exception(e)
                continue
            # clear Django's query log to keep memory bounded in DEBUG
            reset_queries()
            times.append((time() - t0, self.pages_processed))
        # commit new changes to the solr index, if we are indexing
        if self.PROCESS_OCR:
            self.solr.commit()
        batch.save()
        msg = "processed %s pages" % batch.page_count
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        _logger.info(msg)
        event.save()
        # _chart(times)
    except Exception, e:
        # any failure: record the event, try to purge the partial load so
        # the database isn't left half-populated, then re-raise
        msg = "unable to load batch: %s" % e
        _logger.error(msg)
        _logger.exception(e)
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        event.save()
        try:
            self.purge_batch(batch_name)
        except Exception, pbe:
            # best-effort cleanup: log and fall through to the re-raise
            # (which appears in a later chunk)
            _logger.error("purge batch failed for failed load batch: %s" % pbe)
    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

            loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        With strict=False an already-loaded batch is returned as-is
        instead of raising.  On any failure the partial load is purged
        and BatchLoaderException is raised.
        """
        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            # caller gave a filesystem path: expose it under BATCH_STORAGE
            # via a symlink so the batch is reachable from storage
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            # bare batch name: resolve against BATCH_STORAGE
            # NOTE(review): urljoin (not os.path.join) -- BATCH_STORAGE is
            # presumably URL-like here; confirm against settings.
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            # tolerant mode: if the batch was already loaded, return it
            # instead of failing
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass
        LOGGER.info("loading batch: %s", batch_name)
        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()
        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)
            # stash it away for processing later on
            self.current_batch = batch
            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)
            for e in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = e.attrib['reelNumber'].strip()
                # get-or-create the Reel row for this reel number
                try:
                    reel = models.Reel.objects.get(number=reel_number, batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()
            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                # a bad issue METS file is logged and skipped, not fatal
                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    LOGGER.exception(e)
                    continue
                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s", issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        # mark each page as indexed so re-indexing can skip it
                        page.indexed = True
                        page.save()
            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()
            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            # any failure: record the event, purge the partial load so the
            # database isn't left half-populated, then re-raise
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                # best-effort cleanup; the original load error still wins
                LOGGER.error("purge batch failed for failed load batch: %s", pbe)
                LOGGER.exception(pbe)
            raise BatchLoaderException(msg)
        # only mark a release timestamp on production deployments
        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()
        return batch
def purge_batch(self, batch_name): batch_name = _normalize_batch_name(batch_name) try: batch = self._get_batch(batch_name) except Batch.DoesNotExist: LOGGER.info("Batch %s does not exist", batch_name) return event = LoadBatchEvent(batch_name=batch_name, message="starting purge") event.save() try: self._purge_batch(batch) event = LoadBatchEvent(batch_name=batch_name, message="purged") event.save() # clean up symlinks if exists link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if os.path.islink(link_name): LOGGER.info("Removing symlink %s", link_name) os.remove(link_name) except Exception as e: msg = "purge failed: %s" % e LOGGER.exception(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() raise BatchLoaderException(msg)