def main(argv):
    """Build a table of contents for one book and print it to stdout.

    ``argv`` is parsed with optparse; the only flag is ``--human``.  The
    positional arguments take one of three shapes:

    * ``item_id doc path callback``
    * ``item_id doc path``
    * ``book_id`` (also used as the path; ``doc`` stays empty)

    Any other number of positional arguments fails in the tuple unpacking
    below.  The parsed options and the book's scandata namespace are stored
    in the module globals ``opts`` and ``scandata_ns``.

    Output: with ``--human`` the 'readable', 'comments' and 'isok' entries of
    the result; otherwise the result as JSON, wrapped in ``callback(...)``
    when a callback was supplied and indented when it was not.
    """
    import optparse

    global opts, scandata_ns

    option_parser = optparse.OptionParser(
        usage='usage: %prog [options]',
        version='%prog 0.1',
        description='make tocs')
    option_parser.add_option(
        '--human',
        action='store_true',
        default=False,
        help='print some human-readable stuff')
    opts, args = option_parser.parse_args(argv)

    # Defaults used by the shorter argument forms.
    doc, callback = '', None
    arg_count = len(args)
    if arg_count == 4:
        _item_id, doc, path, callback = args
    elif arg_count == 3:
        _item_id, doc, path = args
    else:
        (path,) = args
    # Whatever the form, the first positional argument identifies the book.
    book_id = args[0]

    iabook = Book(book_id, doc, path)
    scandata_ns = iabook.get_scandata_ns()

    # ``djvu`` is a module-level switch selecting the OCR source.
    if djvu:
        pages = iabook.get_pages_as_djvu()
    else:
        pages = iabook.get_pages_as_abbyy()
    pages = filter(pages)
    pages = annotate(pages)

    def clear_page(page):
        page.clear()

    windowed_pages = windowed_iterator(pages, 5, clear_page)
    pages = analyze(windowed_pages)

    toc_result = make_toc.make_toc(iabook, pages,
                                   hardcode_toc_pages,
                                   hardcode_nottoc_pages)
    toc_result['contents_leafnos'] = iabook.get_contents_indices()
    toc_result['readable'] = print_readable(toc_result['qdtoc'])

    if opts.human:
        for key in ('readable', 'comments', 'isok'):
            print(key + ':')
            print(toc_result[key])
            print('')
    elif callback is not None:
        # JSONP-style: callback name, payload and closing paren on
        # separate lines.
        print('%s(' % callback)
        print(json.dumps(toc_result))
        print(')')
    else:
        print(json.dumps(toc_result, indent=4))
def main(argv):
    """Run the table-of-contents analysis for a book and print the result.

    Flags (optparse): ``--human`` switches to a plain-text dump.

    Positional arguments, by count:
      4 -- item_id, doc, path, callback
      3 -- item_id, doc, path
      1 -- book_id, which doubles as the path (doc is left as '')
    Other counts raise from the single-element unpacking.

    Sets the module globals ``opts`` and ``scandata_ns`` and writes to
    stdout: the 'readable'/'comments'/'isok' entries with ``--human``,
    ``callback(<json>)`` when a callback was given, otherwise indented JSON.
    """
    import optparse

    cli = optparse.OptionParser(usage='usage: %prog [options]',
                                version='%prog 0.1',
                                description='make tocs')
    cli.add_option('--human', action='store_true', default=False,
                   help='print some human-readable stuff')

    global opts
    opts, positional = cli.parse_args(argv)

    doc = ''
    callback = None
    if len(positional) == 4:
        doc, path, callback = positional[1:]
    elif len(positional) == 3:
        doc, path = positional[1:]
    else:
        # Single-argument form: the id is also the path.
        (path,) = positional
    iabook = Book(positional[0], doc, path)

    global scandata_ns
    scandata_ns = iabook.get_scandata_ns()

    # Pick the page reader from the module-level ``djvu`` flag.
    read_pages = (iabook.get_pages_as_djvu if djvu
                  else iabook.get_pages_as_abbyy)
    pages = read_pages()
    pages = filter(pages)
    pages = annotate(pages)

    def clear_page(page):
        page.clear()

    pages = analyze(windowed_iterator(pages, 5, clear_page))

    toc_result = make_toc.make_toc(
        iabook, pages, hardcode_toc_pages, hardcode_nottoc_pages)
    toc_result['contents_leafnos'] = iabook.get_contents_indices()
    toc_result['readable'] = print_readable(toc_result['qdtoc'])

    if opts.human:
        for section in ('readable', 'comments', 'isok'):
            print(section + ':')
            print(toc_result[section])
            print('')
        return

    if callback is None:
        print(json.dumps(toc_result, indent=4))
        return

    # Callback supplied: wrap the compact JSON in a function call.
    print('%s(' % callback)
    print(json.dumps(toc_result))
    print(')')
def local_monotonic_p(seq):
    """Yield one boolean per item of *seq*: does it fit its neighbours?

    The value 0 is treated as "no value": a zero item always yields False,
    and zero neighbours are left out of the comparison.  For any other item
    the absolute differences to its non-zero neighbours are summed; the item
    passes when there is at least one such neighbour and the sum does not
    exceed the neighbour count plus a slack of 1.

    Iteration goes through ``windowed_iterator(seq, 1)`` and its
    ``neighbors(1)`` method -- presumably the items directly around the
    current one; confirm against windowed_iterator.
    """
    w = windowed_iterator(seq, 1)
    acceptable = 1  # slack allowed on top of one unit per neighbour
    for p in w:
        if p == 0:
            # Exactly one result per input item: without the ``continue``
            # a zero item would fall through and yield a second value.
            yield False
            continue
        diffs = 0
        expected = 0
        for n in w.neighbors(1):
            # Compare by value; identity tests against an int literal only
            # work by accident of small-integer caching.
            if n != 0:
                diffs += abs(p - n)
                expected += 1
        yield expected > 0 and diffs <= expected + acceptable
def main(args):
    """Print one page of a DjVu XML file as text blocks inside a JS call.

    ``args`` is ``[path, pageNum, callback]``:

    * ``path`` must look like ``/<1-2 digits>/items/..._djvu.xml``; any
      other value ends the process with exit status -1.
    * ``pageNum`` is the zero-based position of the wanted page among the
      ``OBJECT`` elements produced by the windowed iterator.
    * ``callback`` is kept only if it is ``'ttsNextPageCB'``; every other
      value is replaced by ``'ttsStartCB'``.

    Lines reported by ``guess_hfs`` (headers/footers) are skipped.  Words
    are gathered into a block until a word ends with '.' and the block has
    more than ``minWordsInBlock`` words, or a line ends with more than
    ``maxWordsInBlock`` words collected.

    Output on stdout: ``br.<callback>(<json>);`` where the JSON is a list of
    blocks, each ``[text, rect, rect, ...]`` with one
    ``[left, bottom, right, top]`` rectangle per line segment in the block.
    """
    path = args[0]
    pageNum = int(args[1])
    callback = args[2]

    # Raw string so ``\d`` is a regex escape; ``\.`` so the dot before
    # "xml" is literal rather than "any character".
    if not re.match(r'^/\d{1,2}/items/.+_djvu\.xml$', path):
        sys.exit(-1)

    # The callback name is echoed into the output, so only the two known
    # names are ever emitted.
    if callback != 'ttsNextPageCB':
        callback = 'ttsStartCB'

    def drop_event(events):
        for _event, element in events:
            yield element

    def clear_page(page):
        page.clear()

    # Starting value for the "smallest seen" coordinates.  It never reaches
    # the output: a rectangle is only recorded after a word has updated it.
    unset = float('inf')

    textBlocks = []
    block = ''
    rects = []
    numWords = 0

    # The iterators read lazily from the file, so it has to stay open until
    # the page has been fully consumed; ``with`` closes it afterwards.
    with open(path, 'rb') as f:
        pages = windowed_iterator(
            drop_event(etree.iterparse(f, tag='OBJECT')),
            windowsize, clear_page)

        # Advance to the requested page.  If pageNum is past the end the
        # loop just finishes and ``page`` is the last one yielded.
        for i, page in enumerate(pages):
            if i == pageNum:
                break

        hfs = guess_hfs(page, pages)

        for line in page.findall('.//LINE'):
            # skip headers/footers
            if line in hfs:
                continue

            left = top = unset
            right = bottom = -1
            numWordsInLine = 0

            for word in line.findall('.//WORD'):
                numWordsInLine += 1
                # An empty WORD element has text None; treat it as ''.
                text = word.text or ''
                # coords are "left,bottom,right,top"
                coords = [int(c) for c in word.get('coords').split(',')]
                left = min(left, coords[0])
                bottom = max(bottom, coords[1])
                right = max(right, coords[2])
                top = min(top, coords[3])

                block += text + ' '
                numWords += 1

                if text.endswith('.') and numWords > minWordsInBlock:
                    # Sentence end: close the block here and start a new
                    # one (with a fresh rectangle) on the same line.
                    rects.append([left, bottom, right, top])
                    rects.insert(0, block.strip())
                    textBlocks.append(rects)
                    block = ''
                    rects = []
                    numWords = 0
                    numWordsInLine = 0
                    left = top = unset
                    right = bottom = -1

            # end of line
            if numWordsInLine > 0:
                rects.append([left, bottom, right, top])

            if numWords > maxWordsInBlock:
                rects.insert(0, block.strip())
                textBlocks.append(rects)
                block = ''
                numWords = 0
                rects = []

        # Whatever is left over forms the last block.
        if block != '':
            rects.insert(0, block.strip())
            textBlocks.append(rects)

    print('br.%s(%s);' % (callback, json.dumps(textBlocks)))
def main(argv):
    """Analyze one book's OCR pages and print its table of contents.

    Options (optparse): ``--in_deriver``, ``--simpletoc``, ``--human``.

    Positional arguments:
      with ``--in_deriver``:  djvu_xml_path scandata_path
      otherwise one of:       item_id doc path callback
                              item_id doc path
                              book_id   (also used as doc and path)

    Side effects: stores the parsed options and the book's scandata
    namespace in the module globals ``opts`` and ``scandata_ns``.

    Output on stdout:
      ``--simpletoc``  pretty-printed XML built from 'qdtoc_tuples'
      ``--human``      the 'readable', 'comments' and 'isok' entries
      otherwise        JSON, wrapped in ``callback(...)`` when a callback
                       was given, indented when it was not
    """
    import optparse
    import sys

    parser = optparse.OptionParser(usage='usage: %prog [options]',
                                   version='%prog 0.1',
                                   description='make tocs')
    parser.add_option('--in_deriver',
                      action='store_true',
                      default=False,
                      help='match deriver-style args')
    parser.add_option('--simpletoc',
                      action='store_true',
                      default=False,
                      help='do a simple toc analysis')
    parser.add_option('--human',
                      action='store_true',
                      default=False,
                      help='print some human-readable stuff')
    global opts
    opts, args = parser.parse_args(argv)

    # Set before branching: the JSON output stage reads ``callback`` in
    # every mode, and the deriver branch has no callback argument.  Leaving
    # it unset there raised NameError.
    callback = None

    if opts.in_deriver:
        (dvju_xml_path, scandata_path) = args
        iabook = DeriverBook(dvju_xml_path, scandata_path)
    else:
        doc = ''
        if len(args) == 4:
            (item_id, doc, path, callback) = args
        elif len(args) == 3:
            (item_id, doc, path) = args
        else:
            (book_id, ) = args
            doc = path = book_id
        book_id = args[0]
        iabook = Book(book_id, doc, path)

    global scandata_ns
    scandata_ns = iabook.get_scandata_ns()

    # ``djvu`` is a module-level switch selecting the OCR source.
    if djvu:
        pages = iabook.get_pages_as_djvu()
    else:
        pages = iabook.get_pages_as_abbyy()

    pages = filter(pages)
    pages = annotate(pages)

    def clear_page(page):
        page.clear()

    windowed_pages = windowed_iterator(pages, 5, clear_page)
    pages = analyze(windowed_pages)

    if opts.simpletoc:
        toc_result = make_toc.simple_make_toc(iabook, pages)
    else:
        toc_result = make_toc.make_toc(iabook, pages,
                                       hardcode_toc_pages,
                                       hardcode_nottoc_pages)
    # NOTE(review): 'readable' is no longer filled in here, so --human
    # relies on make_toc supplying that key -- confirm.

    if opts.simpletoc:
        xml = toc_to_xml.make_xml(toc_result['qdtoc_tuples'], version)
        # pretty_print already ends the text with a newline, so write it
        # out without adding another.
        sys.stdout.write(etree.tostring(xml, pretty_print=True))
    elif opts.human:
        for r in ('readable', 'comments', 'isok'):
            print(r + ':')
            print(toc_result[r])
            print('')
    elif callback is not None:
        print('%s(' % callback)
        print(json.dumps(toc_result))
        print(')')
    else:
        print(json.dumps(toc_result, indent=4))
def main(args):
    """Emit the text blocks of one DjVu XML page as a JavaScript call.

    Arguments, taken from ``args`` by position:

    0. path -- must match ``/<1-2 digits>/items/..._djvu.xml``, otherwise
       the process exits with status -1.
    1. pageNum -- zero-based position of the page among the ``OBJECT``
       elements yielded by the windowed iterator.
    2. callback -- only ``"ttsNextPageCB"`` is honoured; any other value
       becomes ``"ttsStartCB"``.

    Header/footer lines (as reported by ``guess_hfs``) are ignored.  A block
    is closed when a word ends with "." and the block holds more than
    ``minWordsInBlock`` words, or at the end of a line once more than
    ``maxWordsInBlock`` words have been collected.

    Prints ``br.<callback>(<json>);``.  The JSON is a list of blocks; each
    block is a list whose first item is the block text and whose remaining
    items are ``[left, bottom, right, top]`` rectangles.
    """
    path, pageNum, callback = args[0], int(args[1]), args[2]

    # Raw pattern, with the extension dot escaped so that it only matches
    # a real ".xml" suffix.
    if re.match(r"^/\d{1,2}/items/.+_djvu\.xml$", path) is None:
        sys.exit(-1)
    if callback != "ttsNextPageCB":
        callback = "ttsStartCB"

    def drop_event(events):
        for _event, element in events:
            yield element

    def clear_page(page):
        page.clear()

    textBlocks = []
    block_words = []  # text of the words in the block being built
    rects = []        # rectangles of the block being built

    f = open(path, "rb")
    try:
        context = etree.iterparse(f, tag="OBJECT")
        pages = windowed_iterator(drop_event(context), windowsize, clear_page)

        # Step through the pages until the requested index; when pageNum
        # is beyond the last page, ``page`` ends up as the last one seen.
        for i, page in enumerate(pages):
            if i == pageNum:
                break

        hfs = guess_hfs(page, pages)

        for line in page.findall(".//LINE"):
            # skip headers/footers
            if line in hfs:
                continue

            # Bounding box of the words seen on this line since the last
            # block boundary.  The infinite start values never reach the
            # output: a box is stored only after a word has narrowed it.
            left = top = float("inf")
            right = bottom = -1
            numWordsInLine = 0

            for word in line.findall(".//WORD"):
                numWordsInLine += 1
                # Empty WORD elements have text None.
                text = word.text or ""
                # coords are "left,bottom,right,top"
                coords = [int(c) for c in word.get("coords").split(",")]
                if coords[0] < left:
                    left = coords[0]
                if coords[1] > bottom:
                    bottom = coords[1]
                if coords[2] > right:
                    right = coords[2]
                if coords[3] < top:
                    top = coords[3]

                block_words.append(text)

                if text.endswith(".") and len(block_words) > minWordsInBlock:
                    # Sentence boundary: finish this block mid-line.
                    rects.append([left, bottom, right, top])
                    textBlocks.append([" ".join(block_words).strip()] + rects)
                    block_words = []
                    rects = []
                    numWordsInLine = 0
                    left = top = float("inf")
                    right = bottom = -1

            # end of line
            if numWordsInLine > 0:
                rects.append([left, bottom, right, top])

            if len(block_words) > maxWordsInBlock:
                textBlocks.append([" ".join(block_words).strip()] + rects)
                block_words = []
                rects = []

        # Remaining words make up the final block.
        if block_words:
            textBlocks.append([" ".join(block_words).strip()] + rects)
    finally:
        # Close the input even if parsing fails part-way through.
        f.close()

    print("br.%s(%s);" % (callback, json.dumps(textBlocks)))
def main(argv):
    """Command-line entry point: compute and print a book's table of contents.

    Flags (optparse):
      --in_deriver  positional args are ``djvu_xml_path scandata_path``
      --simpletoc   use ``make_toc.simple_make_toc`` and print XML
      --human       print the 'readable', 'comments' and 'isok' entries

    Without ``--in_deriver`` the positional arguments are one of
    ``item_id doc path callback``, ``item_id doc path`` or a lone
    ``book_id`` (which is then used as doc and path as well).

    The parsed options and the scandata namespace are published through the
    module globals ``opts`` and ``scandata_ns``.  When neither
    ``--simpletoc`` nor ``--human`` is given the result is printed as JSON:
    wrapped in ``callback(...)`` if a callback was passed, indented
    otherwise.
    """
    import optparse
    import sys

    global opts, scandata_ns

    parser = optparse.OptionParser(
        usage='usage: %prog [options]',
        version='%prog 0.1',
        description='make tocs')
    for flag, text in (('--in_deriver', 'match deriver-style args'),
                       ('--simpletoc', 'do a simple toc analysis'),
                       ('--human', 'print some human-readable stuff')):
        parser.add_option(flag, action='store_true', default=False, help=text)
    opts, args = parser.parse_args(argv)

    # Deriver invocations carry no callback.  Initialise it here so the
    # JSON output stage never hits an unbound name.
    callback = None

    if opts.in_deriver:
        dvju_xml_path, scandata_path = args
        iabook = DeriverBook(dvju_xml_path, scandata_path)
    else:
        doc = ''
        if len(args) == 4:
            _item_id, doc, path, callback = args
        elif len(args) == 3:
            _item_id, doc, path = args
        else:
            # Single-argument form: the id stands in for doc and path.
            (book_id,) = args
            doc = path = book_id
        book_id = args[0]
        iabook = Book(book_id, doc, path)

    scandata_ns = iabook.get_scandata_ns()

    # Module-level ``djvu`` selects which OCR format to read.
    if djvu:
        pages = iabook.get_pages_as_djvu()
    else:
        pages = iabook.get_pages_as_abbyy()
    pages = filter(pages)
    pages = annotate(pages)

    def clear_page(page):
        page.clear()

    pages = analyze(windowed_iterator(pages, 5, clear_page))

    if opts.simpletoc:
        toc_result = make_toc.simple_make_toc(iabook, pages)
        xml = toc_to_xml.make_xml(toc_result['qdtoc_tuples'], version)
        # The pretty-printed text already ends in a newline; write it
        # verbatim rather than letting print add a second one.
        sys.stdout.write(etree.tostring(xml, pretty_print=True))
        return

    toc_result = make_toc.make_toc(iabook, pages,
                                   hardcode_toc_pages, hardcode_nottoc_pages)

    if opts.human:
        # NOTE(review): 'readable' is not set in this function; it is
        # presumably provided by make_toc -- confirm.
        for key in ('readable', 'comments', 'isok'):
            print(key + ':')
            print(toc_result[key])
            print('')
    elif callback is not None:
        print('%s(' % callback)
        print(json.dumps(toc_result))
        print(')')
    else:
        print(json.dumps(toc_result, indent=4))