def process_coordinates(self, batch_path):
    """Process word coordinates for every page of the batch at *batch_path*.

    *batch_path* may be a filesystem path (with a directory component) or a
    bare batch name; a bare name is resolved against settings.BATCH_STORAGE.

    Raises:
        BatchLoaderException: if coordinate extraction fails for any page.
    """
    logging.info("process word coordinates for batch at %s", batch_path)
    dirname, batch_name = os.path.split(batch_path.rstrip("/"))
    if dirname:
        # an explicit path was given; let _get_batch resolve the source
        batch_source = None
    else:
        batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
        if not batch_source.endswith("/"):
            batch_source += "/"
    batch_name = _normalize_batch_name(batch_name)
    try:
        batch = self._get_batch(batch_name, batch_source, create=False)
        self.current_batch = batch
        for issue in batch.issues.all():
            for page in issue.pages.all():
                # Pages without OCR cannot yield coordinates; skip them
                # instead of failing on a missing filename.
                if not page.ocr_filename:
                    logging.warning(
                        "Batch [%s] has page [%s] that has no OCR. "
                        "Skipping processing coordinates for page.",
                        batch_name, page)
                    continue
                url = urlparse.urljoin(self.current_batch.storage_url,
                                       page.ocr_filename)
                lang_text, coords = ocr_extractor(url)
                self._process_coordinates(page, coords)
    except Exception as e:
        msg = "unable to process coordinates for batch: %s" % e
        # exception() logs the message and the traceback in one call,
        # replacing the redundant error() + exception() pair
        _logger.exception(msg)
        raise BatchLoaderException(msg)
def process_coordinates(self, batch_path):
    """Process word coordinates for every page of the batch at *batch_path*.

    *batch_path* may be a filesystem path (with a directory component) or a
    bare batch name; a bare name is resolved against settings.BATCH_STORAGE.
    Pages with no OCR file are skipped with a warning.

    Raises:
        BatchLoaderException: if coordinate extraction fails for any page.
    """
    logging.info("process word coordinates for batch at %s", batch_path)
    dirname, batch_name = os.path.split(batch_path.rstrip("/"))
    if dirname:
        # an explicit path was given; let _get_batch resolve the source
        batch_source = None
    else:
        batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
        if not batch_source.endswith("/"):
            batch_source += "/"
    batch_name = _normalize_batch_name(batch_name)
    try:
        batch = self._get_batch(batch_name, batch_source, create=False)
        self.current_batch = batch
        for issue in batch.issues.all():
            for page in issue.pages.all():
                if not page.ocr_filename:
                    # warning() is the non-deprecated spelling of warn();
                    # lazy %-args avoid formatting when the level is off
                    logging.warning(
                        "Batch [%s] has page [%s] that has no OCR. "
                        "Skipping processing coordinates for page.",
                        batch_name, page)
                else:
                    url = urlparse.urljoin(self.current_batch.storage_url,
                                           page.ocr_filename)
                    logging.debug("Extracting OCR from url %s", url)
                    lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
    except Exception as e:
        # 'as e' works on Python 2.6+ and 3; 'Exception, e' is 2-only
        msg = "unable to process coordinates for batch: %s" % e
        # exception() logs the message and the traceback in one call
        _logger.exception(msg)
        raise BatchLoaderException(msg)
def process_ocr(self, page, index=True):
    """Extract OCR text and word coordinates for *page* and persist them.

    Creates an OCR record with one language_text per detected language
    (unknown languages fall back to English), optionally indexes the page
    in Solr, and saves the page.

    Args:
        page: the page model instance to process.
        index: when True, add the page's solr_doc to the Solr index.
    """
    _logger.debug("extracting ocr text and word coords for %s", page.url)
    url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename)
    lang_text, coords = ocr_extractor(url)
    if self.PROCESS_COORDINATES:
        self._process_coordinates(page, coords)
    ocr = OCR()
    ocr.page = page
    ocr.save()
    # items() works on both Python 2 and 3; iteritems() is Python 2 only
    for lang, text in lang_text.items():
        try:
            language = models.Language.objects.get(
                Q(code=lang) | Q(lingvoj__iendswith=lang))
        except models.Language.DoesNotExist:
            # default to english as per requirement
            language = models.Language.objects.get(code='eng')
        ocr.language_texts.create(language=language, text=text)
    page.ocr = ocr
    if index:
        _logger.debug("indexing ocr for: %s", page.url)
        self.solr.add(**page.solr_doc)
        page.indexed = True
    page.save()
def process_ocr(self, page, index=True):
    """Extract OCR text and word coordinates for *page* and persist them.

    Creates an OCR record with one language_text per detected language
    (unknown languages fall back to English), optionally indexes the page
    in Solr, and saves the page.

    Args:
        page: the page model instance to process.
        index: when True, add the page's solr_doc to the Solr index.
    """
    _logger.debug("extracting ocr text and word coords for %s", page.url)
    url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename)
    lang_text, coords = ocr_extractor(url)
    if self.PROCESS_COORDINATES:
        self._process_coordinates(page, coords)
    ocr = OCR()
    ocr.page = page
    ocr.save()
    # items() works on both Python 2 and 3; iteritems() is Python 2 only
    for lang, text in lang_text.items():
        try:
            language = models.Language.objects.get(
                Q(code=lang) | Q(lingvoj__iendswith=lang))
        except models.Language.DoesNotExist:
            # default to english as per requirement
            language = models.Language.objects.get(code='eng')
        ocr.language_texts.create(language=language, text=text)
    page.ocr = ocr
    if index:
        _logger.debug("indexing ocr for: %s", page.url)
        self.solr.add(**page.solr_doc)
        page.indexed = True
    page.save()
def process_coordinates(self, batch_path):
    """Process word coordinates for every page of the batch at *batch_path*.

    *batch_path* may be a filesystem path (with a directory component) or a
    bare batch name; a bare name is resolved against settings.BATCH_STORAGE.
    Pages with no OCR file are skipped with a warning.

    Raises:
        BatchLoaderException: if coordinate extraction fails for any page.
    """
    LOGGER.info("process word coordinates for batch at %s", batch_path)
    dirname, batch_name = os.path.split(batch_path.rstrip("/"))
    if dirname:
        # an explicit path was given; let _get_batch resolve the source
        batch_source = None
    else:
        batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
        if not batch_source.endswith("/"):
            batch_source += "/"
    batch_name = _normalize_batch_name(batch_name)
    try:
        batch = self._get_batch(batch_name, batch_source, create=False)
        self.current_batch = batch
        for issue in batch.issues.all():
            for page in issue.pages.all():
                if not page.ocr_filename:
                    # warning() is the non-deprecated spelling of warn();
                    # lazy %-args avoid formatting when the level is off
                    LOGGER.warning(
                        "Batch [%s] has page [%s] that has no OCR. "
                        "Skipping processing coordinates for page.",
                        batch_name, page)
                else:
                    url = urlparse.urljoin(self.current_batch.storage_url,
                                           page.ocr_filename)
                    LOGGER.debug("Extracting OCR from url %s", url)
                    lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
    except Exception as e:
        msg = "unable to process coordinates for batch: %s" % e
        # exception() logs the message and the traceback in one call
        LOGGER.exception(msg)
        raise BatchLoaderException(msg)
def process_ocr(self, page):
    """Extract OCR text and word coordinates for *page* and persist them.

    Creates an OCR record with one language_text per detected language
    (unknown languages fall back to English) and stashes the per-language
    text on page.lang_text for later Solr indexing.

    Returns:
        The saved page instance.
    """
    LOGGER.debug("extracting ocr text and word coords for %s", page.url)
    url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename)
    lang_text, coords = ocr_extractor(url)
    if self.PROCESS_COORDINATES:
        self._process_coordinates(page, coords)
    ocr = OCR()
    ocr.page = page
    ocr.save()
    lang_text_solr = {}
    # items() works on both Python 2 and 3; iteritems() is Python 2 only
    for lang, text in lang_text.items():
        try:
            language = models.Language.objects.get(
                Q(code=lang) | Q(lingvoj__iendswith=lang))
        except models.Language.DoesNotExist:
            # warning() is the non-deprecated spelling of warn()
            LOGGER.warning(
                "Language %s does not exist in the database. "
                "Defaulting to English.", lang)
            # default to english as per requirement
            language = models.Language.objects.get(code='eng')
        ocr.language_texts.create(language=language)
        lang_text_solr[language.code] = text
    page.ocr = ocr
    page.lang_text = lang_text_solr
    page.save()
    return page
def test_extractor(self):
    """ocr_extractor returns per-language text plus a word coordinate map."""
    # 'data_dir' instead of 'dir' to avoid shadowing the builtin
    data_dir = join(dirname(dirname(__file__)), 'test-data')
    ocr_file = join(data_dir, 'ocr.xml')
    text, coord_info = ocr_extractor(ocr_file)
    coords = coord_info["coords"]
    # open() in binary mode + context manager replaces the Python 2-only
    # file() builtin, which also leaked the file handle
    with open(join(data_dir, 'ocr.txt'), 'rb') as f:
        expected_text = {"eng": f.read().decode('utf-8')}
    self.assertEqual(text, expected_text)
    self.assertEqual(len(coords), 2489)
    self.assertEqual(len(coords['place']), 3)
def test_extractor(self):
    """ocr_extractor returns per-language text plus a word coordinate map."""
    # 'data_dir' instead of 'dir' to avoid shadowing the builtin
    data_dir = join(dirname(dirname(__file__)), 'test-data')
    ocr_file = join(data_dir, 'ocr.xml')
    text, coord_info = ocr_extractor(ocr_file)
    coords = coord_info["coords"]
    # open() in binary mode + context manager replaces the Python 2-only
    # file() builtin, which also leaked the file handle
    with open(join(data_dir, 'ocr.txt'), 'rb') as f:
        expected_text = {"eng": f.read().decode('utf-8')}
    self.assertEqual(text, expected_text)
    self.assertEqual(len(coords), 2489)
    self.assertEqual(len(coords['place']), 3)
def test_extractor(self):
    """ocr_extractor returns per-language text plus a word coordinate map."""
    # 'data_dir' instead of 'dir' to avoid shadowing the builtin
    data_dir = join(dirname(dirname(__file__)), 'test-data')
    ocr_file = join(data_dir, 'ocr.xml')
    text, coord_info = ocr_extractor(ocr_file)
    coords = coord_info["coords"]
    # open() in binary mode + context manager replaces the Python 2-only
    # file() builtin, which also leaked the file handle
    with open(join(data_dir, 'ocr.txt'), 'rb') as f:
        expected_text = {"eng": f.read().decode('utf-8')}
    self.assertEqual(text, expected_text)
    self.assertEqual(len(coords), 2150)
    self.assertEqual(len(coords['place']), 3)
    # Craft. should be normalized to Craft
    # since Solr's highlighting will not include
    # trailing punctuation in highlighted text
    # assertIn/assertNotIn replace has_key(), which was removed in Python 3
    self.assertIn('Craft', coords)
    self.assertNotIn('Craft.', coords)
def test_extractor(self):
    """ocr_extractor returns per-language text plus a word coordinate map."""
    # 'data_dir' instead of 'dir' to avoid shadowing the builtin
    data_dir = join(dirname(dirname(__file__)), 'test-data')
    ocr_file = join(data_dir, 'ocr.xml')
    text, coord_info = ocr_extractor(ocr_file)
    coords = coord_info["coords"]
    # open() in binary mode + context manager replaces the Python 2-only
    # file() builtin, which also leaked the file handle
    with open(join(data_dir, 'ocr.txt'), 'rb') as f:
        expected_text = {"eng": f.read().decode('utf-8')}
    self.assertEqual(text, expected_text)
    self.assertEqual(len(coords), 2150)
    self.assertEqual(len(coords['place']), 3)
    # Craft. should be normalized to Craft
    # since Solr's highlighting will not include
    # trailing punctuation in highlighted text
    # assertIn/assertNotIn replace has_key(), which was removed in Python 3
    self.assertIn('Craft', coords)
    self.assertNotIn('Craft.', coords)
def solr_doc(self):
    """Assemble the Solr document dict for this page.

    Starts from the title-level solr_doc, drops title-only fields, adds
    page-level fields, then attaches one "ocr_<lang>" entry per language
    (unsupported languages are indexed as English).
    """
    issued = self.issue.date_issued
    date = "%4i%02i%02i" % (issued.year, issued.month, issued.day)
    # start with basic title data
    doc = self.issue.title.solr_doc
    # no real need to repeat this stuff in pages
    for title_only_field in ("essay", "url", "holding_type"):
        del doc[title_only_field]
    page_fields = {
        "id": self.url,
        "type": "page",
        "batch": self.issue.batch.name,
        "date": date,
        "page": self.number,
        "sequence": self.sequence,
        "section_label": self.section_label,
        "edition_label": self.issue.edition_label,
    }
    doc.update(page_fields)
    # This is needed when building the solr index.
    # TODO this is also used when visiting a page like http://127.0.0.1:8000/search/pages/results/?state=&date1=1789&date2=1963&proxtext=&x=0&y=0&dateFilterType=yearRange&rows=20&searchType=basic&format=json
    # In that case we might want to break it from using this and pull directly from SOLR for performance reasons
    # However, when ingesting a batch, ocr_abs_filename may not be set
    ocr_texts = self.lang_text
    if self.ocr_abs_filename is not None:
        logging.debug("extracting ocr for solr page")
        ocr_texts, _ = ocr_extractor(self.ocr_abs_filename)
    for lang, ocr_text in ocr_texts.items():
        # make sure Solr is configured to handle the language and if it's
        # not just treat it as English
        if lang not in settings.SOLR_LANGUAGES:
            lang = "eng"
        doc["ocr_%s" % lang] = ocr_text
    return doc
def solr_doc(self):
    """Assemble the Solr document dict for this page.

    Starts from the title-level solr_doc, drops title-only fields, adds
    page-level fields, then attaches one 'ocr_<lang>' entry per language
    (unsupported languages are indexed as English).
    """
    issued_on = self.issue.date_issued
    date = "%4i%02i%02i" % (issued_on.year, issued_on.month, issued_on.day)
    # start with basic title data
    doc = self.issue.title.solr_doc
    # no real need to repeat this stuff in pages
    del doc['essay']
    del doc['url']
    del doc['holding_type']
    doc['id'] = self.url
    doc['type'] = 'page'
    doc['batch'] = self.issue.batch.name
    doc['date'] = date
    doc['page'] = self.number
    doc['sequence'] = self.sequence
    doc['section_label'] = self.section_label
    doc['edition_label'] = self.issue.edition_label
    # This is needed when building the solr index.
    # TODO this is also used when visiting a page like http://127.0.0.1:8000/search/pages/results/?state=&date1=1789&date2=1963&proxtext=&x=0&y=0&dateFilterType=yearRange&rows=20&searchType=basic&format=json
    # In that case we might want to break it from using this and pull directly from SOLR for performance reasons
    # However, when ingesting a batch, ocr_abs_filename may not be set
    ocr_texts = self.lang_text
    if self.ocr_abs_filename is not None:
        logging.debug("extracting ocr for solr page")
        ocr_texts, _ = ocr_extractor(self.ocr_abs_filename)
    for lang, ocr_text in ocr_texts.items():
        # make sure Solr is configured to handle the language and if it's
        # not just treat it as English
        if lang not in settings.SOLR_LANGUAGES:
            lang = "eng"
        doc['ocr_%s' % lang] = ocr_text
    return doc