Exemplo n.º 1
0
    def test_ocr_dump(self):
        """Build an OCR dump from the test batch and verify the archive.

        Loads ``batch_oru_testbatch_ver01``, creates an ``OcrDump`` from it
        and checks the dump's name, path, SHA-1 digest, member count, first
        member size and first member mtime. The batch is purged at the end of
        the test, even when one of the assertions fails.
        """
        loader = BatchLoader()
        batch_dir = os.path.join(OcrDumpTests.batchDir, "batch_oru_testbatch_ver01")
        batch = loader.load_batch(batch_dir)
        try:
            self.assertEqual(batch.page_count, 27)

            t0 = datetime.datetime.now()

            dump = OcrDump.new_from_batch(batch)
            self.assertEqual(dump.name, "batch_oru_testbatch_ver01.tar.bz2")
            self.assertEqual(dump.path, os.path.join(OcrDumpTests.dumpDir, "batch_oru_testbatch_ver01.tar.bz2"))

            # make sure the sha1 looks good; the archive is binary data, so
            # it has to be read as bytes, and the handle must be closed
            sha1 = hashlib.sha1()
            with open(dump.path, "rb") as fh:
                sha1.update(fh.read())
            self.assertEqual(dump.sha1, sha1.hexdigest())

            # make sure there are the right number of things in the dump
            with tarfile.open(dump.path, "r:bz2") as t:
                members = t.getmembers()
            self.assertEqual(len(members), 27 * 2)  # ocr xml and txt for each page
            self.assertEqual(members[0].size, 19)

            # mtime on files in the archive should be just after we
            # created the OcrDump object from the batch
            t1 = datetime.datetime.fromtimestamp(members[0].mtime)
            self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))
        finally:
            # Make sure the batch is gone - mysql gets purged between tests,
            # but solr does not.  This can't be done in teardown since the
            # mysql db is purged :(  Running it in ``finally`` keeps a failed
            # assertion from leaving the batch behind for later tests.
            loader = BatchLoader()
            loader.purge_batch('batch_oru_testbatch_ver01')
Exemplo n.º 2
0
def dump_ocr(batch_name):
    """Create an OCR dump for the named batch unless one already exists.

    Looks up the ``Batch`` whose name is *batch_name*. If the batch already
    has an OCR dump, this is logged and nothing else happens; otherwise a new
    dump is built with ``OcrDump.new_from_batch``.

    Any exception raised by ``Batch.objects.get`` propagates to the caller.
    """
    batch = Batch.objects.get(name=batch_name)

    # Keep the try body down to the one attribute access that is expected to
    # raise, so unrelated errors are not attributed to a missing dump.
    try:
        existing_dump = batch.ocr_dump
    except OcrDump.DoesNotExist:
        # as expected: no dump has been generated for this batch yet
        existing_dump = None

    # Only bail out when a dump is really there; a falsy value must fall
    # through to generation instead of returning silently.
    if existing_dump:
        logger.info("ocr already generated for %s", batch)
        return

    logger.info("starting to dump ocr for %s", batch)
    dump = OcrDump.new_from_batch(batch)
    logger.info("created ocr dump %s for %s", dump, batch)