def process_ocr(self, page, index=True):
    """Extract the OCR for ``page``, store it, and optionally index it.

    The OCR document URL is built by joining the current batch's
    ``storage_url`` with ``page.ocr_filename``. The extractor returns a
    mapping of language code to text plus word-coordinate data. The
    coordinates are handed to ``_process_coordinates`` only when
    ``self.PROCESS_COORDINATES`` is truthy.

    An ``OCR`` record is saved for the page and one language text is
    created on it per extracted language. A language is looked up by its
    ``code`` or by the tail of its ``lingvoj`` value; unknown languages
    fall back to the ``"eng"`` language.

    :param page: the page whose OCR should be processed.
    :param index: when true, ``page.solr_doc`` is added to ``self.solr``
        and the page is flagged as indexed.
    """
    _logger.debug("extracting ocr text and word coords for %s" % page.url)

    ocr_url = urlparse.urljoin(
        self.current_batch.storage_url,
        page.ocr_filename,
    )
    text_by_lang, word_coords = ocr_extractor(ocr_url)

    if self.PROCESS_COORDINATES:
        self._process_coordinates(page, word_coords)

    # The OCR record is saved first, then its per-language texts are
    # created through the ``language_texts`` relation.
    ocr_record = OCR()
    ocr_record.page = page
    ocr_record.save()

    for lang_code, text in text_by_lang.iteritems():
        lang_filter = Q(code=lang_code) | Q(lingvoj__iendswith=lang_code)
        try:
            language = models.Language.objects.get(lang_filter)
        except models.Language.DoesNotExist:
            # default to english as per requirement
            language = models.Language.objects.get(code="eng")
        ocr_record.language_texts.create(language=language, text=text)

    page.ocr = ocr_record

    if index:
        _logger.debug("indexing ocr for: %s" % page.url)
        self.solr.add(**page.solr_doc)
        page.indexed = True

    page.save()
def test_extractor(self):
    """Check ``ocr_extractor`` against the ``ocr.xml`` / ``ocr.txt`` fixtures.

    Verifies that the extracted text equals the expected UTF-8 text keyed
    under ``"eng"``, that the coordinate map has the expected number of
    distinct words, and that trailing punctuation is stripped from the
    words used as coordinate keys.
    """
    data_dir = join(dirname(dirname(__file__)), 'test-data')
    ocr_file = join(data_dir, 'ocr.xml')

    text, coord_info = ocr_extractor(ocr_file)
    coords = coord_info["coords"]

    # Read the fixture through a context manager so the handle is closed
    # deterministically; bytes are decoded explicitly as UTF-8.
    with open(join(data_dir, 'ocr.txt'), 'rb') as expected_file:
        expected_text = {"eng": expected_file.read().decode('utf-8')}

    self.assertEqual(text, expected_text)
    self.assertEqual(len(coords), 2150)
    self.assertEqual(len(coords['place']), 3)

    # Craft. should be normalized to Craft
    # since Solr's highlighting will not include
    # trailing punctuation in highlighted text
    self.assertTrue('Craft' in coords)
    self.assertFalse('Craft.' in coords)
def process_coordinates(self, batch_path):
    """Extract and process word coordinates for every page of a batch.

    ``batch_path`` is split into a parent directory and a batch name. When
    there is no parent directory (a bare batch name was given), the batch
    source is derived from ``settings.BATCH_STORAGE`` and forced to end
    with a slash; otherwise no batch source is passed along. The existing
    batch is then fetched (``create=False``), stored on
    ``self.current_batch``, and each page's OCR document is run through
    ``ocr_extractor`` with the resulting coordinates handed to
    ``_process_coordinates``.

    :param batch_path: a batch name or a path whose last component is the
        batch name; a trailing slash is ignored.
    :raises BatchLoaderException: if any exception occurs while fetching
        the batch or processing its pages. The failure is logged first.
    """
    _logger.info("process word coordinates for batch at %s", batch_path)

    # ``parent_dir`` rather than ``dirname`` so a module-level
    # ``os.path.dirname`` import is not shadowed.
    parent_dir, batch_name = os.path.split(batch_path.rstrip("/"))
    if parent_dir:
        batch_source = None
    else:
        batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
        if not batch_source.endswith("/"):
            batch_source += "/"

    batch_name = _normalize_batch_name(batch_name)

    try:
        batch = self._get_batch(batch_name, batch_source, create=False)
        self.current_batch = batch
        for issue in batch.issues.all():
            for page in issue.pages.all():
                url = urlparse.urljoin(
                    self.current_batch.storage_url,
                    page.ocr_filename,
                )
                # Only the coordinates are needed here; the extracted
                # text is discarded.
                _lang_text, coords = ocr_extractor(url)
                self._process_coordinates(page, coords)
    except Exception as e:
        msg = "unable to process coordinates for batch: %s" % e
        _logger.error(msg)
        _logger.exception(e)
        raise BatchLoaderException(msg)