def test_ocr_dump(self):
    """Create an OCR dump from the test batch and verify its name, path,
    sha1 checksum, member count/sizes and mtimes, then purge the batch."""
    loader = BatchLoader()
    batch_dir = os.path.join(OcrDumpTests.batchDir,
                             "batch_oru_testbatch_ver01")
    batch = loader.load_batch(batch_dir)
    self.assertEqual(batch.page_count, 27)

    t0 = datetime.datetime.now()
    dump = OcrDump.new_from_batch(batch)
    self.assertEqual(dump.name, "batch_oru_testbatch_ver01.tar.bz2")
    self.assertEqual(dump.path,
                     os.path.join(OcrDumpTests.dumpDir,
                                  "batch_oru_testbatch_ver01.tar.bz2"))

    # make sure the sha1 looks good; the dump is a binary bz2 tarball, so
    # it must be read in binary mode (text mode would attempt to decode
    # arbitrary bytes), and the context manager closes the handle even if
    # an assertion fails
    sha1 = hashlib.sha1()
    with open(dump.path, "rb") as fh:
        sha1.update(fh.read())
    self.assertEqual(dump.sha1, sha1.hexdigest())

    # make sure there are the right number of things in the dump
    with tarfile.open(dump.path, "r:bz2") as t:
        members = t.getmembers()
    self.assertEqual(len(members), 27 * 2)  # ocr xml and txt for each page
    self.assertEqual(members[0].size, 19)

    # mtime on files in the archive should be just after we
    # created the OcrDump object from the batch
    t1 = datetime.datetime.fromtimestamp(members[0].mtime)
    self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

    # Make sure the batch is gone - mysql gets purged between tests, but
    # solr does not. This can't be done in teardown since the mysql db
    # is purged :(
    loader = BatchLoader()
    loader.purge_batch('batch_oru_testbatch_ver01')
def handle(self, batch_name, *args, **options):
    """Load the named batch, optionally running OCR / coordinate processing.

    Raises CommandError when extra positional arguments are given, or when
    the loader fails (details are written to the load_batch log).
    """
    if len(args) != 0:
        raise CommandError('Usage is load_batch %s' % self.args)
    loader = BatchLoader(process_ocr=options['process_ocr'],
                         process_coordinates=options['process_coordinates'])
    try:
        loader.load_batch(batch_name)
    # Python 3 exception syntax: the original "except X, e" form is a
    # SyntaxError on Python 3
    except BatchLoaderException as e:
        LOGGER.exception(e)
        raise CommandError("unable to load batch. check the load_batch log for clues")
def handle(self, batch_name, *args, **options):
    """Load the named batch, optionally running OCR / coordinate processing.

    Raises CommandError when extra positional arguments are given, or when
    the loader fails (details are written to the load_batch log).
    """
    if len(args) != 0:
        raise CommandError('Usage is load_batch %s' % self.args)
    loader = BatchLoader(
        process_ocr=options['process_ocr'],
        process_coordinates=options['process_coordinates'])
    try:
        loader.load_batch(batch_name)
    # Python 3 exception syntax: the original "except X, e" form is a
    # SyntaxError on Python 3
    except BatchLoaderException as e:
        LOGGER.exception(e)
        raise CommandError(
            "unable to load batch. check the load_batch log for clues")
def handle(self, batch_path, *args, **options):
    """Management-command entry point: validate the arguments, then load
    the batch found at *batch_path*."""
    # Reject any unexpected positional arguments up front.
    if len(args) != 0:
        raise CommandError('Usage: load_batch %s' % self.args)
    if not os.path.exists(batch_path):
        raise CommandError(
            'Batch path does not exist: {}'.format(batch_path))

    ocr_flag = options['process_ocr']
    coord_flag = options['process_coordinates']
    loader = BatchLoader(process_ocr=ocr_flag,
                         process_coordinates=coord_flag)
    try:
        batch = loader.load_batch(batch_path)
    except BatchLoaderException as e:
        # Full traceback goes to the log; the user gets a short pointer.
        LOGGER.exception(e)
        raise CommandError("Batch load failed. See logs/load_batch_#.log")
def test_ocr_dump(self):
    """Create an OCR dump from the test batch and verify its name, path,
    sha1 checksum, member count/sizes and (timezone-aware) mtimes, then
    purge the batch."""
    loader = BatchLoader()
    batch_dir = os.path.join(OcrDumpTests.batchDir,
                             "batch_oru_testbatch_ver01")
    batch = loader.load_batch(batch_dir)
    self.assertEqual(batch.page_count, 27)

    t0 = timezone.now()
    dump = OcrDump.new_from_batch(batch)
    self.assertEqual(dump.name, "batch_oru_testbatch_ver01.tar.bz2")
    self.assertEqual(
        dump.path,
        os.path.join(OcrDumpTests.dumpDir,
                     "batch_oru_testbatch_ver01.tar.bz2"))

    # make sure the sha1 looks good; the context manager closes the file
    # handle even if an assertion below fails (the original leaked it)
    sha1 = hashlib.sha1()
    with open(dump.path, "rb") as fh:
        sha1.update(fh.read())
    self.assertEqual(dump.sha1, sha1.hexdigest())

    # make sure there are the right number of things in the dump
    with tarfile.open(dump.path, "r:bz2") as t:
        members = t.getmembers()
    self.assertEqual(len(members), 27 * 2)  # ocr xml and txt for each page
    self.assertEqual(members[0].size, 19)

    # mtime on files in the archive should be just after we
    # created the OcrDump object from the batch
    t1 = datetime.datetime.fromtimestamp(members[0].mtime)
    t1 = timezone.make_aware(t1)
    self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

    # Make sure the batch is gone - mysql gets purged between tests, but
    # solr does not. This can't be done in teardown since the mysql db
    # is purged :(
    loader = BatchLoader()
    loader.purge_batch('batch_oru_testbatch_ver01')
def test_load_batch(self):
    """Load the mini test batch end to end, checking batch, issue, page,
    note, OCR and solr-document metadata, then purge the batch again."""
    # Extract mini-batch tarball to /tmp somewhere; the context manager
    # closes the tarball even if extraction fails
    tarpath = os.path.join(os.path.dirname(core.__file__),
                           'test-data', 'testbatch.tgz')
    with tarfile.open(tarpath) as tar:
        tar.extractall(path=BatchLoaderTest.batchDir)
    settings.BATCH_STORAGE = BatchLoaderTest.batchDir

    batch_dir = os.path.join(BatchLoaderTest.batchDir,
                             "batch_oru_testbatch_ver01")
    loader = BatchLoader(process_ocr=False)
    batch = loader.load_batch(batch_dir)
    self.assertTrue(isinstance(batch, Batch))
    self.assertEqual(batch.name, 'batch_oru_testbatch_ver01')
    self.assertEqual(len(batch.issues.all()), 4)

    title = Title.objects.get(lccn='sn83030214')
    self.assertTrue(title.has_issues)

    issue = batch.issues.all()[0]
    self.assertEqual(issue.volume, '1')
    self.assertEqual(issue.number, '1')
    self.assertEqual(issue.edition, 1)
    self.assertEqual(issue.title.lccn, 'sn83030214')
    self.assertEqual(issue.date_issued.strftime('%Y-%m-%d'), '1999-06-15')
    self.assertEqual(len(issue.pages.all()), 15)

    page = issue.pages.all()[0]
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.url, '/lccn/sn83030214/1999-06-15/ed-1/seq-1/')

    notes = page.notes.order_by("type").all()
    self.assertEqual(len(notes), 2)
    # NOTE(review): the index-based asserts below rely on the default
    # (unordered) queryset order, not the ordered `notes` above
    note = page.notes.all()[0]
    self.assertEqual(note.type, "noteAboutReproduction")
    self.assertEqual(note.text, "Present")
    note = page.notes.all()[1]
    self.assertEqual(note.type, "agencyResponsibleForReproduction")
    self.assertEqual(note.text, "oru")

    # Validate page 1's metadata
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.jp2_filename,
                     'sn83030214/print/1999061501/0001.jp2')
    self.assertEqual(page.jp2_length, 411)
    self.assertEqual(page.jp2_width, 411)
    self.assertEqual(page.ocr_filename,
                     'sn83030214/print/1999061501/0001.xml')
    self.assertEqual(page.pdf_filename,
                     'sn83030214/print/1999061501/0001.pdf')

    # extract ocr data just for this page
    loader.process_ocr(page, index=False)
    # assertIsNotNone instead of assertTrue(x != None)
    self.assertIsNotNone(page.ocr)
    self.assertTrue(len(page.ocr.text) > 0)

    p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
    self.assertIsNotNone(p.ocr)

    # check that the solr_doc looks legit
    solr_doc = page.solr_doc
    self.assertEqual(solr_doc['id'],
                     '/lccn/sn83030214/1999-06-15/ed-1/seq-1/')
    self.assertEqual(solr_doc['type'], 'page')
    self.assertEqual(solr_doc['sequence'], 1)
    self.assertEqual(solr_doc['lccn'], 'sn83030214')
    self.assertEqual(solr_doc['title'], 'New-York tribune.')
    self.assertEqual(solr_doc['date'], '19990615')
    self.assertEqual(solr_doc['batch'], 'batch_oru_testbatch_ver01')
    self.assertEqual(solr_doc['subject'], [
        'New York (N.Y.)--Newspapers.',
        'New York County (N.Y.)--Newspapers.'])
    self.assertEqual(solr_doc['place'], [
        'New York--Brooklyn--New York City',
        'New York--Queens--New York City'])
    self.assertEqual(solr_doc['note'], [
        "I'll take Manhattan",
        'The Big Apple'])
    self.assertTrue('essay' not in solr_doc)
    self.assertEqual(solr_doc['ocr_eng'], 'LCCNsn83030214Page1')

    # purge the batch and make sure it's gone from the db
    loader.purge_batch('batch_oru_testbatch_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83030214').has_issues,
                     False)
def test_load_batch(self):
    """Load the mini test batch end to end, checking batch, issue, page,
    note, OCR and solr-document metadata, then purge the batch again."""
    # Extract mini-batch tarball to /tmp somewhere; the context manager
    # closes the tarball even if extraction fails
    tarpath = os.path.join(os.path.dirname(core.__file__),
                           'test-data', 'testbatch.tgz')
    with tarfile.open(tarpath) as tar:
        tar.extractall(path=BatchLoaderTest.batchDir)
    settings.BATCH_STORAGE = BatchLoaderTest.batchDir

    batch_dir = os.path.join(BatchLoaderTest.batchDir,
                             "batch_oru_testbatch_ver01")
    loader = BatchLoader(process_ocr=False)
    batch = loader.load_batch(batch_dir)
    self.assertTrue(isinstance(batch, Batch))
    self.assertEqual(batch.name, 'batch_oru_testbatch_ver01')
    self.assertEqual(len(batch.issues.all()), 4)

    title = Title.objects.get(lccn='sn83030214')
    self.assertTrue(title.has_issues)

    issue = batch.issues.all()[0]
    self.assertEqual(issue.volume, '1')
    self.assertEqual(issue.number, '1')
    self.assertEqual(issue.edition, 1)
    self.assertEqual(issue.title.lccn, 'sn83030214')
    # call strftime on the date instance rather than via the class
    self.assertEqual(issue.date_issued.strftime('%Y-%m-%d'), '1999-06-15')
    self.assertEqual(len(issue.pages.all()), 15)

    page = issue.pages.all()[0]
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.url, u'/lccn/sn83030214/1999-06-15/ed-1/seq-1/')

    note = page.notes.all()[1]
    self.assertEqual(note.type, "noteAboutReproduction")
    self.assertEqual(note.text, "Present")

    # Validate page 1's metadata
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.jp2_filename,
                     'sn83030214/print/1999061501/0001.jp2')
    self.assertEqual(page.jp2_length, 411)
    self.assertEqual(page.jp2_width, 411)
    self.assertEqual(page.ocr_filename,
                     'sn83030214/print/1999061501/0001.xml')
    self.assertEqual(page.pdf_filename,
                     'sn83030214/print/1999061501/0001.pdf')

    # extract ocr data just for this page
    loader.process_ocr(page, index=False)
    # assertIsNotNone instead of assertTrue(x != None)
    self.assertIsNotNone(page.ocr)
    self.assertTrue(len(page.ocr.text) > 0)

    p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
    self.assertIsNotNone(p.ocr)

    # check that the solr_doc looks legit
    solr_doc = page.solr_doc
    self.assertEqual(solr_doc['id'],
                     '/lccn/sn83030214/1999-06-15/ed-1/seq-1/')
    self.assertEqual(solr_doc['type'], 'page')
    self.assertEqual(solr_doc['sequence'], 1)
    self.assertEqual(solr_doc['lccn'], 'sn83030214')
    self.assertEqual(solr_doc['title'], 'New-York tribune.')
    self.assertEqual(solr_doc['date'], '19990615')
    self.assertEqual(solr_doc['batch'], 'batch_oru_testbatch_ver01')
    self.assertEqual(solr_doc['subject'], [
        u'New York (N.Y.)--Newspapers.',
        u'New York County (N.Y.)--Newspapers.'])
    self.assertEqual(solr_doc['place'], [
        u'New York--Brooklyn--New York City',
        u'New York--Queens--New York City'])
    self.assertEqual(solr_doc['note'], [
        u"I'll take Manhattan",
        u'The Big Apple'])
    # dict.has_key() was removed in Python 3; use the `in` operator
    self.assertTrue('essay' not in solr_doc)
    self.assertEqual(solr_doc['ocr_eng'], 'LCCNsn83030214Page1')

    # purge the batch and make sure it's gone from the db
    loader.purge_batch('batch_oru_testbatch_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83030214').has_issues,
                     False)