Example #1
    def purge_batch(self, batch_name):
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up the symlink if it exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)
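
A minimal invocation sketch for the purge_batch method above. This is an illustration only: the import path core.batch_loader is an assumption, not something given in the example.

    # Hypothetical usage sketch -- the module path is assumed for illustration.
    from core.batch_loader import BatchLoader, BatchLoaderException

    loader = BatchLoader()
    try:
        loader.purge_batch("batch_curiv_ahwahnee_ver01")
    except BatchLoaderException as e:
        # purge_batch logs the failure, records a LoadBatchEvent, and re-raises
        print("purge failed: %s" % e)
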
Example #2
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                _logger.info("Batch already loaded: %s" % batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass

        _logger.info("loading batch: %s" % batch_name)
        t0 = time()
        times = []

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):
Example #3
class BatchLoader(object):
    """This class allows you to load a batch into the database. A loader
    object serves as a context for a particular batch loading job.
    """
    def __init__(self,
                 process_ocr=True,
                 process_coordinates=True,
                 in_copyright=False):
        """Create a BatchLoader.
        The process_ocr parameter is used (mainly in testing) when we don't
        want to spend time actually extracting ocr text and indexing.
        """
        self.PROCESS_OCR = process_ocr
        if self.PROCESS_OCR:
            self.solr = SolrConnection(settings.SOLR)
        self.PROCESS_COORDINATES = process_coordinates
        self.IN_COPYRIGHT = in_copyright

    def _find_batch_file(self, batch):
        # look for batch_1.xml, BATCH_1.xml, etc
        for alias in [
                "batch_1.xml", "BATCH_1.xml", "batchfile_1.xml", "batch_2.xml",
                "BATCH_2.xml", "batch.xml"
        ]:
            if os.path.isfile(os.path.join(batch.storage_url, alias)):
                return alias
        else:
            raise BatchLoaderException(
                "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?"
                % batch.path)

    def _sanity_check_batch(self, batch):
        if not os.path.exists(batch.path):
            raise BatchLoaderException("batch does not exist at %s" %
                                       batch.path)
        # b = urllib2.urlopen(batch.url)
        batch.validated_batch_file = self._find_batch_file(batch)

    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.
          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')
        """
        self.pages_processed = 0

        logging.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                _logger.info("creating symlink %s -> %s", batch_path,
                             link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = os.path.join(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                _logger.info("Batch already loaded: %s" % batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass

        _logger.info("loading batch: %s" % batch_name)
        t0 = time()
        times = []

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):

                reel_number = e.attrib['reelNumber'].strip()

                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue = self._load_issue(mets_url)
                except ValueError as e:
                    _logger.exception(e)
                    continue
                reset_queries()
                times.append((time() - t0, self.pages_processed))
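
The class and __init__ docstrings above describe the intended usage: build a loader, optionally skipping OCR extraction, and hand it a batch. A minimal sketch under the same assumed core.batch_loader module path (the path, like the batch name, is only illustrative):

    # Hypothetical sketch: skip OCR indexing (useful in tests, per the
    # __init__ docstring) and load a batch that lives under BATCH_STORAGE.
    from core.batch_loader import BatchLoader

    loader = BatchLoader(process_ocr=False, process_coordinates=False)
    batch = loader.load_batch("batch_curiv_ahwahnee_ver01", strict=False)
    print("loaded %s with %s pages" % (batch.name, batch.page_count))

With strict=False, a batch that is already in the database is simply returned instead of being reloaded.
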
Example #4
    def purge_batch(self, batch_name):
        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            batch = self._get_batch(batch_name)
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up the symlink if it exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                _logger.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)
Example #5
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue = self._load_issue(mets_url)
                except ValueError as e:
                    _logger.exception(e)
                    continue
                reset_queries()
                times.append((time() - t0, self.pages_processed))

            # commit new changes to the solr index, if we are indexing
            if self.PROCESS_OCR:
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            _logger.info(msg)
            event.save()

            # _chart(times)
        except Exception as e:
            msg = "unable to load batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                _logger.error("purge batch failed for failed load batch: %s" %
                              pbe)
Example #6
    def load_batch(self, batch_path, strict=True):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        """

        LOGGER.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if batch_path != link_name and not os.path.islink(link_name):
                LOGGER.info("creating symlink %s -> %s", batch_path, link_name)
                os.symlink(batch_path, link_name)
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        if not strict:
            try:
                batch = Batch.objects.get(name=batch_name)
                LOGGER.info("Batch already loaded: %s", batch_name)
                return batch
            except Batch.DoesNotExist as e:
                pass

        LOGGER.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):

                reel_number = e.attrib['reelNumber'].strip()

                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist as e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)

                try:
                    issue, pages = self._load_issue(mets_url)
                except ValueError as e:
                    LOGGER.exception(e)
                    continue

                # commit new changes to the solr index, if we are indexing
                if self.PROCESS_OCR:
                    LOGGER.info("Adding pages to solr index from issue %s", issue.title)
                    for page in pages:
                        LOGGER.debug("indexing ocr for: %s", page.url)
                        self.solr.add(**page.solr_doc)
                        page.indexed = True
                        page.save()

            if self.PROCESS_OCR:
                LOGGER.info("Committing solr index")
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            LOGGER.info(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                LOGGER.error("purge batch failed for failed load batch: %s", pbe)
                LOGGER.exception(pbe)
            raise BatchLoaderException(msg)

        if settings.IS_PRODUCTION:
            batch.released = datetime.now()
            batch.save()

        return batch
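
As the except branch above shows, a failed load records a LoadBatchEvent, tries to purge the partially loaded batch, and re-raises the error as a BatchLoaderException, so callers only need to handle that one exception. A sketch under the same assumed module path (the batch path is likewise illustrative):

    # Hypothetical sketch: load from an absolute path (a symlink into
    # settings.BATCH_STORAGE is created for it) and handle a failed load.
    from core.batch_loader import BatchLoader, BatchLoaderException

    loader = BatchLoader()
    try:
        batch = loader.load_batch("/data/batches/batch_curiv_ahwahnee_ver01")
    except BatchLoaderException as e:
        # by this point load_batch has already attempted to purge the batch
        print("load failed: %s" % e)
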
Example #7
    def purge_batch(self, batch_name):
        batch_name = _normalize_batch_name(batch_name)

        try:
            batch = self._get_batch(batch_name)
        except Batch.DoesNotExist:
            LOGGER.info("Batch %s does not exist", batch_name)
            return

        event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
        event.save()

        try:
            self._purge_batch(batch)
            event = LoadBatchEvent(batch_name=batch_name, message="purged")
            event.save()
            # clean up the symlink if it exists
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
            if os.path.islink(link_name):
                LOGGER.info("Removing symlink %s", link_name)
                os.remove(link_name)
        except Exception as e:
            msg = "purge failed: %s" % e
            LOGGER.exception(msg)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            raise BatchLoaderException(msg)