예제 #1
0
    def process_ocr(self, page, index=True):
        _logger.debug("extracting ocr text and word coords for %s" % page.url)

        url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        ocr.save()
        for lang, text in lang_text.iteritems():
            try:
                language = models.Language.objects.get(Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code="eng")
            ocr.language_texts.create(language=language, text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s" % page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()
예제 #2
0
    def test_extractor(self):
        dir = join(dirname(dirname(__file__)), 'test-data')
        ocr_file = join(dir, 'ocr.xml')
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        expected_text = {"eng": file(join(dir, 'ocr.txt')).read().decode('utf-8')}

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords.keys()), 2150)
        self.assertEqual(len(coords['place']), 3)
        # Craft. should be normalized to Craft
        # since Solr's highlighting will not include
        # trailing punctuation in highlighted text
        self.assertTrue(coords.has_key('Craft'))
        self.assertTrue(not coords.has_key('Craft.'))
예제 #3
0
    def process_coordinates(self, batch_path):
        logging.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename)

                    lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        except Exception, e:
            msg = "unable to process coordinates for batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            raise BatchLoaderException(msg)