if args.v: print 'error reading scandata file: ', scandata_file continue scandata_pages = scandata.find('pageData').findall('page') if args.v: print 'Loaded', len( scandata_pages), ' scandata pages in', clock() - t, 's' ia_page_index = 0 scandata_index = 0 for page in scandata_pages: if (skipScanDataPage(page)): scandata_index += 1 continue # db_item = collection.find_one({'scan_id': row[0], 'ia_page_num': ia_page_index}) # if (db_item is None): info = { 'scan_id': row[0], 'scandata_index': scandata_index, 'ia_page_num': ia_page_index, 'page_num': ia_page_index + 1, 'leaf_num': page.get('leafNum'), 'has_illustration': { 'gold_standard': False
scan_file_string = scan_file_string.replace('xmlns="http://archive.org/scribe/xml"', '') scandata = ET.fromstring(scan_file_string) except: if args.v: print 'error reading scandata file: ', scandata_file continue scandata_pages = scandata.find('pageData').findall('page') if args.v: print 'Loaded', len(scandata_pages), ' scandata pages in', clock() - t, 's' ia_page_index = 0 scandata_index = 0 while(scandata_index < len(scandata_pages) and skipScanDataPage(scandata_pages[scandata_index])): if args.v: print 'Skipping ', scandata_index scandata_index += 1 db_item = collection.find_one({'scan_id': row[0], 'ia_page_num': ia_page_index}) if (db_item is None): info = { 'scan_id': row[0], 'scandata_index': scandata_index, 'ia_page_num': ia_page_index, 'page_num': ia_page_index + 1, 'leaf_num': scandata_pages[scandata_index].get('leafNum') if (scandata_index < len(scandata_pages)) else '', 'has_illustration': {
print 'parsing scandata' t = clock() scandata = ET.parse(scandata_file) scandata_pages = scandata.find('pageData').findall('page') print 'found', len(scandata_pages), 'pages from scan data in', clock() - t, 's' print 'parsing abbyy' t = clock() abbyy = ET.parse(gzip.open(abbyy_file)) abbyy_pages = abbyy.findall('{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page') print 'found', len(abbyy_pages), 'pages from abbyy data in', clock() - t, 's' ia_page_index = 0 scandata_index = 0 while(skipScanDataPage(scandata_pages[scandata_index])): scandata_index += 1 result = processPage( scan_id, ia_page_index, scandata_pages[scandata_index], abbyy_pages[scandata_index], False ) output_writer.writerow([ row[0], row[3], result['abbyy_processing'], result['n_picture_blocks'],
except: if args.v: print 'error reading scandata file: ', scandata_file continue scandata_pages = scandata.find('pageData').findall('page') if args.v: print 'Loaded', len(scandata_pages), ' scandata pages in', clock() - t, 's' ia_page_index = 0 scandata_index = 0 for page in scandata_pages: if (skipScanDataPage(page)): scandata_index += 1 continue # db_item = collection.find_one({'scan_id': row[0], 'ia_page_num': ia_page_index}) # if (db_item is None): info = { 'scan_id': row[0], 'scandata_index': scandata_index, 'ia_page_num': ia_page_index, 'page_num': ia_page_index + 1, 'leaf_num': page.get('leafNum'), 'has_illustration': { 'gold_standard': False
print "parsing scandata" t = clock() scandata = ET.parse(scandata_file) scandata_pages = scandata.find("pageData").findall("page") print "found", len(scandata_pages), "pages from scan data in", clock() - t, "s" print "parsing abbyy" t = clock() abbyy = ET.parse(gzip.open(abbyy_file)) abbyy_pages = abbyy.findall("{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page") print "found", len(abbyy_pages), "pages from abbyy data in", clock() - t, "s" ia_page_index = 0 scandata_index = 0 while skipScanDataPage(scandata_pages[scandata_index]): scandata_index += 1 result = processPage(scan_id, ia_page_index, scandata_pages[scandata_index], abbyy_pages[scandata_index], False) output_writer.writerow( [row[0], row[3], result["abbyy_processing"], result["n_picture_blocks"], result["coverage"]] ) ia_page_index += 1 scandata_index += 1 output_file.close() control_file.close() print "Finished in", (clock() - t0)
def runCSV():
    """Run picture-block processing for one scan from the command line.

    Command-line arguments (parsed from ``sys.argv``):
        scan      scan id; selects ``scandata/<scan>/<scan>_scandata.xml`` and
                  ``scandata/<scan>/<scan>_abbyy.gz`` as inputs.
        --page    optional index into the scandata page list. When given,
                  only that page is processed, the result is printed and the
                  process exits.
        --render  optional boolean forwarded to ``processPage``. Accepts a
                  bare ``--render`` or an explicit value such as
                  ``--render false``.

    Without ``--page``, every scandata page that ``skipScanDataPage`` does not
    reject is processed, and one CSV row per processed page is written to
    ``output/pictureblocks/<scan>-pictureblocks.csv``.

    Relies on ``os``, ``clock``, ``processPage``, ``average`` and
    ``benchmarks`` being available at module level.
    """
    import argparse
    import csv
    import gzip
    import sys
    from helpers import skipScanDataPage
    from xml.etree import cElementTree as ET

    def _parse_bool(value):
        # argparse's type=bool treats every non-empty string (even "False")
        # as True, so the textual value is interpreted explicitly instead.
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    ap = argparse.ArgumentParser(description='picture block processing')
    ap.add_argument('scan', type=str, help='scan id')
    ap.add_argument('--page', type=int, help='page #', default=None)
    ap.add_argument('--render', type=_parse_bool, nargs='?', const=True,
                    help='render blocks', default=False)
    args = ap.parse_args()

    scan_id = args.scan
    abbyy_file = 'scandata/%s/%s_abbyy.gz' % (scan_id, scan_id)
    scandata_file = 'scandata/%s/%s_scandata.xml' % (scan_id, scan_id)

    # Single-argument print(...) produces the same text as the former print
    # statements and is valid with or without print_function.
    print('parsing scandata')
    t = clock()
    scandata = ET.parse(scandata_file)
    scandata_pages = scandata.find('pageData').findall('page')
    print('found %s pages from scan data in %s s'
          % (len(scandata_pages), clock() - t))

    print('parsing abbyy')
    t = clock()
    # Close the gzip stream as soon as the tree has been built.
    with gzip.open(abbyy_file) as abbyy_stream:
        abbyy = ET.parse(abbyy_stream)
    abbyy_pages = abbyy.findall(
        '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page')
    print('found %s pages from abbyy data in %s s'
          % (len(abbyy_pages), clock() - t))

    if args.page is not None:
        # Single-page mode. Reject indexes that cannot be processed instead
        # of silently falling through to the last page of the scan.
        if not 0 <= args.page < len(scandata_pages):
            ap.error('--page %d is out of range (scan has %d scandata pages)'
                     % (args.page, len(scandata_pages)))
        if skipScanDataPage(scandata_pages[args.page]):
            ap.error('--page %d is a skipped scandata page' % args.page)
        # The IA page index counts only the non-skipped pages before it.
        ia_page_index = sum(
            1 for page in scandata_pages[:args.page]
            if not skipScanDataPage(page))
        print(processPage(scan_id, ia_page_index, scandata_pages[args.page],
                          abbyy_pages[args.page], args.render))
        sys.exit()

    # Process every page that is not skipped; ia_page_index advances only
    # for processed pages, so it equals the position in `results`.
    results = []
    ia_page_index = 0
    for i, page in enumerate(scandata_pages):
        if skipScanDataPage(page):
            continue
        results.append(processPage(
            scan_id, ia_page_index, page, abbyy_pages[i], args.render))
        ia_page_index += 1

    output_filename = 'output/pictureblocks/%s-pictureblocks.csv' % scan_id
    output_dir = os.path.dirname(output_filename)
    if not os.path.isdir(output_dir):
        # makedirs also creates the parent "output" directory when missing.
        os.makedirs(output_dir)

    with open(output_filename, 'w') as output_file:
        writer = csv.writer(output_file)
        writer.writerow([
            'IA page',
            'Image detected',
            'Processing time',
            '# of picture blocks',
            '% coverage',
            'intersection',
        ])
        for p, result in enumerate(results):
            writer.writerow([
                p,
                result['image_detected'],
                result['abbyy_processing'],
                result['n_picture_blocks'],
                result['coverage'],
                result['blocks_intersect'],
            ])
            if result['image_detected']:
                print('Image detected on page %s' % p)

    if args.render:
        print('Avg image processing time: %s s'
              % (average(benchmarks['image_processing']),))
def runCSV():
    """Run picture-block processing for one scan from the command line.

    Command-line arguments (parsed from ``sys.argv``):
        scan      scan id; selects ``scandata/<scan>/<scan>_scandata.xml`` and
                  ``scandata/<scan>/<scan>_abbyy.gz`` as inputs.
        --page    optional index into the scandata page list. When given,
                  only that page is processed, the result is printed and the
                  process exits.
        --render  optional boolean forwarded to ``processPage``. Accepts a
                  bare ``--render`` or an explicit value such as
                  ``--render false``.

    Without ``--page``, every scandata page that ``skipScanDataPage`` does not
    reject is processed, and one CSV row per processed page is written to
    ``output/pictureblocks/<scan>-pictureblocks.csv``.

    Relies on ``os``, ``clock``, ``processPage``, ``average`` and
    ``benchmarks`` being available at module level.
    """
    import argparse
    import csv
    import gzip
    import sys
    from helpers import skipScanDataPage
    from xml.etree import cElementTree as ET

    def _parse_bool(value):
        # argparse's type=bool treats every non-empty string (even "False")
        # as True, so the textual value is interpreted explicitly instead.
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    ap = argparse.ArgumentParser(description='picture block processing')
    ap.add_argument('scan', type=str, help='scan id')
    ap.add_argument('--page', type=int, help='page #', default=None)
    ap.add_argument('--render', type=_parse_bool, nargs='?', const=True,
                    help='render blocks', default=False)
    args = ap.parse_args()

    scan_id = args.scan
    abbyy_file = 'scandata/%s/%s_abbyy.gz' % (scan_id, scan_id)
    scandata_file = 'scandata/%s/%s_scandata.xml' % (scan_id, scan_id)

    # Single-argument print(...) produces the same text as the former print
    # statements and is valid with or without print_function.
    print('parsing scandata')
    t = clock()
    scandata = ET.parse(scandata_file)
    scandata_pages = scandata.find('pageData').findall('page')
    print('found %s pages from scan data in %s s'
          % (len(scandata_pages), clock() - t))

    print('parsing abbyy')
    t = clock()
    # Close the gzip stream as soon as the tree has been built.
    with gzip.open(abbyy_file) as abbyy_stream:
        abbyy = ET.parse(abbyy_stream)
    abbyy_pages = abbyy.findall(
        '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page')
    print('found %s pages from abbyy data in %s s'
          % (len(abbyy_pages), clock() - t))

    if args.page is not None:
        # Single-page mode. Reject indexes that cannot be processed instead
        # of silently falling through to the last page of the scan.
        if not 0 <= args.page < len(scandata_pages):
            ap.error('--page %d is out of range (scan has %d scandata pages)'
                     % (args.page, len(scandata_pages)))
        if skipScanDataPage(scandata_pages[args.page]):
            ap.error('--page %d is a skipped scandata page' % args.page)
        # The IA page index counts only the non-skipped pages before it.
        ia_page_index = sum(
            1 for page in scandata_pages[:args.page]
            if not skipScanDataPage(page))
        print(processPage(scan_id, ia_page_index, scandata_pages[args.page],
                          abbyy_pages[args.page], args.render))
        sys.exit()

    # Process every page that is not skipped; ia_page_index advances only
    # for processed pages, so it equals the position in `results`.
    results = []
    ia_page_index = 0
    for i, page in enumerate(scandata_pages):
        if skipScanDataPage(page):
            continue
        results.append(processPage(
            scan_id, ia_page_index, page, abbyy_pages[i], args.render))
        ia_page_index += 1

    output_filename = 'output/pictureblocks/%s-pictureblocks.csv' % scan_id
    output_dir = os.path.dirname(output_filename)
    if not os.path.isdir(output_dir):
        # makedirs also creates the parent "output" directory when missing.
        os.makedirs(output_dir)

    with open(output_filename, 'w') as output_file:
        writer = csv.writer(output_file)
        writer.writerow([
            'IA page',
            'Image detected',
            'Processing time',
            '# of picture blocks',
            '% coverage',
            'intersection',
        ])
        for p, result in enumerate(results):
            writer.writerow([
                p,
                result['image_detected'],
                result['abbyy_processing'],
                result['n_picture_blocks'],
                result['coverage'],
                result['blocks_intersect'],
            ])
            if result['image_detected']:
                print('Image detected on page %s' % p)

    if args.render:
        print('Avg image processing time: %s s'
              % (average(benchmarks['image_processing']),))