예제 #1
0
def main(argv):
    """Build a table of contents for one book and print it to stdout.

    After option parsing, the positional arguments take one of three forms:

    * ``item_id doc path callback``
    * ``item_id doc path``
    * ``book_id`` alone, which is then also used as the path (``doc`` stays
      the empty string)

    Any other number of positionals raises ``ValueError`` from the tuple
    unpacking.  Whatever the form, the first positional is the book id.

    Output: with ``--human`` the 'readable', 'comments' and 'isok' entries
    of the result are printed, each under its own heading.  Otherwise the
    result is printed as JSON: wrapped as ``callback( ... )`` when a callback
    was supplied, or indented when it was not.

    Side effects: rebinds the module globals ``opts`` and ``scandata_ns``.
    """
    import optparse

    global opts
    global scandata_ns

    option_parser = optparse.OptionParser(usage='usage: %prog [options]',
                                          version='%prog 0.1',
                                          description='make tocs')
    option_parser.add_option('--human', action='store_true', default=False,
                             help='print some human-readable stuff')
    opts, args = option_parser.parse_args(argv)

    callback = None
    positional_count = len(args)
    if positional_count == 4:
        _, doc, path, callback = args
    elif positional_count == 3:
        _, doc, path = args
    else:
        # Unpacking accepts exactly one argument; anything else is a
        # ValueError.
        (path,) = args
        doc = ''
    book_id = args[0]

    iabook = Book(book_id, doc, path)
    scandata_ns = iabook.get_scandata_ns()

    # The module-level ``djvu`` flag selects which page source is read.
    load_pages = (iabook.get_pages_as_djvu if djvu
                  else iabook.get_pages_as_abbyy)
    annotated = annotate(filter(load_pages()))

    def release(page):
        # Handed to windowed_iterator as its third argument.
        page.clear()

    window = windowed_iterator(annotated, 5, release)
    analyzed = analyze(window)
    toc_result = make_toc.make_toc(iabook, analyzed,
                                   hardcode_toc_pages, hardcode_nottoc_pages)

    toc_result['contents_leafnos'] = iabook.get_contents_indices()
    toc_result['readable'] = print_readable(toc_result['qdtoc'])

    if opts.human:
        for key in ('readable', 'comments', 'isok'):
            print(key + ':')
            print(toc_result[key])
            print('')
    elif callback is None:
        print(json.dumps(toc_result, indent=4))
    else:
        # JSONP-style wrapper around the compact JSON.
        print('%s(' % callback)
        print(json.dumps(toc_result))
        print(')')
예제 #2
0
def main(argv):
    """Build a table of contents for one book and print it to stdout.

    After option parsing, the positional arguments take one of three forms:

    * ``item_id doc path callback``
    * ``item_id doc path``
    * ``book_id`` alone, which is then also used as the path (``doc`` stays
      the empty string)

    Any other number of positionals raises ``ValueError`` from the tuple
    unpacking.  Whatever the form, the first positional is the book id.

    Output: with ``--human`` the 'readable', 'comments' and 'isok' entries
    of the result are printed, each under its own heading.  Otherwise the
    result is printed as JSON: wrapped as ``callback( ... )`` when a callback
    was supplied, or indented when it was not.

    Side effects: rebinds the module globals ``opts`` and ``scandata_ns``.
    """
    import optparse

    global opts
    global scandata_ns

    option_parser = optparse.OptionParser(usage='usage: %prog [options]',
                                          version='%prog 0.1',
                                          description='make tocs')
    option_parser.add_option('--human', action='store_true', default=False,
                             help='print some human-readable stuff')
    opts, args = option_parser.parse_args(argv)

    callback = None
    positional_count = len(args)
    if positional_count == 4:
        _, doc, path, callback = args
    elif positional_count == 3:
        _, doc, path = args
    else:
        # Unpacking accepts exactly one argument; anything else is a
        # ValueError.
        (path,) = args
        doc = ''
    book_id = args[0]

    iabook = Book(book_id, doc, path)
    scandata_ns = iabook.get_scandata_ns()

    # The module-level ``djvu`` flag selects which page source is read.
    load_pages = (iabook.get_pages_as_djvu if djvu
                  else iabook.get_pages_as_abbyy)
    annotated = annotate(filter(load_pages()))

    def release(page):
        # Handed to windowed_iterator as its third argument.
        page.clear()

    window = windowed_iterator(annotated, 5, release)
    analyzed = analyze(window)
    toc_result = make_toc.make_toc(iabook, analyzed,
                                   hardcode_toc_pages, hardcode_nottoc_pages)

    toc_result['contents_leafnos'] = iabook.get_contents_indices()
    toc_result['readable'] = print_readable(toc_result['qdtoc'])

    if opts.human:
        for key in ('readable', 'comments', 'isok'):
            print(key + ':')
            print(toc_result[key])
            print('')
    elif callback is None:
        print(json.dumps(toc_result, indent=4))
    else:
        # JSONP-style wrapper around the compact JSON.
        print('%s(' % callback)
        print(json.dumps(toc_result))
        print(')')
예제 #3
0
def local_monotonic_p(seq):
    """Yield one boolean per item of ``seq``: does it agree with its neighbours?

    Items are walked through ``windowed_iterator(seq, 1)``.  For each item
    ``p``:

    * if ``p`` equals 0, ``False`` is yielded (presumably 0 stands for
      "no value" -- confirm against the callers);
    * otherwise the absolute differences between ``p`` and every non-zero
      neighbour returned by ``w.neighbors(1)`` are summed, and ``True`` is
      yielded when at least one such neighbour exists and the sum does not
      exceed the neighbour count plus a tolerance of 1.

    Changes from the previous version:

    * zero checks use ``==`` / ``!=`` rather than ``is`` / ``is not``;
      identity against an int literal only works by accident of CPython's
      small-integer caching and misses values such as ``0.0``;
    * a zero item now yields exactly one value.  Previously control fell
      through after ``yield False`` and a second value was produced for the
      same item, putting the output out of step with ``seq``.
    """
    w = windowed_iterator(seq, 1)
    acceptable = 1  # slack allowed on top of one unit of difference per neighbour
    for p in w:
        if p == 0:
            yield False
            continue
        diffs = 0
        expected = 0
        for n in w.neighbors(1):
            if n != 0:
                diffs += abs(p - n)
                expected += 1
        yield expected > 0 and diffs <= expected + acceptable
예제 #4
0
def local_monotonic_p(seq):
    """Yield one boolean per item of ``seq``: does it agree with its neighbours?

    Items are walked through ``windowed_iterator(seq, 1)``.  For each item
    ``p``:

    * if ``p`` equals 0, ``False`` is yielded (presumably 0 stands for
      "no value" -- confirm against the callers);
    * otherwise the absolute differences between ``p`` and every non-zero
      neighbour returned by ``w.neighbors(1)`` are summed, and ``True`` is
      yielded when at least one such neighbour exists and the sum does not
      exceed the neighbour count plus a tolerance of 1.

    Changes from the previous version:

    * zero checks use ``==`` / ``!=`` rather than ``is`` / ``is not``;
      identity against an int literal only works by accident of CPython's
      small-integer caching and misses values such as ``0.0``;
    * a zero item now yields exactly one value.  Previously control fell
      through after ``yield False`` and a second value was produced for the
      same item, putting the output out of step with ``seq``.
    """
    w = windowed_iterator(seq, 1)
    acceptable = 1  # slack allowed on top of one unit of difference per neighbour
    for p in w:
        if p == 0:
            yield False
            continue
        diffs = 0
        expected = 0
        for n in w.neighbors(1):
            if n != 0:
                diffs += abs(p - n)
                expected += 1
        yield expected > 0 and diffs <= expected + acceptable
def main(args):
    """Print the text blocks of one DjVu XML page as a JSONP call.

    ``args`` is ``[path, pageNum, callback]``:

    * ``path`` must match ``/<1-2 digits>/items/..._djvu.xml``; otherwise the
      process exits with status -1.
    * ``pageNum`` is the zero-based index of the ``OBJECT`` element to use.
      If the document has fewer pages than that, the last page read is used
      (unchanged from the previous version).
    * ``callback`` is forced to ``'ttsStartCB'`` unless it is exactly
      ``'ttsNextPageCB'``.

    Output is ``br.<callback>(<json>);`` where the JSON is a list of blocks.
    Each block is a list whose first element is the block's text and whose
    remaining elements are ``[left, bottom, right, top]`` rectangles, one per
    line (or part of a line) that contributed words to the block.

    A block is closed when a word ends with '.' and the block holds more than
    ``minWordsInBlock`` words, or at the end of a line once it holds more
    than ``maxWordsInBlock`` words.  Lines reported by ``guess_hfs`` are
    skipped.

    Changes from the previous version:

    * the input file is closed when processing ends, including on error
      paths (it was previously left open);
    * the path pattern is a raw string and the '.' before ``xml`` is escaped,
      so only a literal ``_djvu.xml`` suffix is accepted;
    * a document with no pages exits with status -1 rather than failing with
      an unbound ``page`` variable;
    * ``WORD`` elements without text are skipped instead of raising
      ``TypeError`` when their text is concatenated.
    """
    path = args[0]
    pageNum = int(args[1])
    callback = args[2]

    if not re.match(r'^/\d{1,2}/items/.+_djvu\.xml$', path):
        sys.exit(-1)

    if 'ttsNextPageCB' != callback:
        callback = 'ttsStartCB'

    def clear_page(page):
        page.clear()

    textBlocks = []

    # Pages are parsed lazily, so the file must stay open until all page
    # data has been consumed.
    with open(path) as f:
        context = etree.iterparse(f, tag='OBJECT')
        # iterparse yields (event, element) pairs; only the element is used.
        elements = (element for _event, element in context)
        pages = windowed_iterator(elements, windowsize, clear_page)

        page = None
        for i, page in enumerate(pages):
            if i == pageNum:
                break
        if page is None:
            # No OBJECT element at all: nothing to emit.
            sys.exit(-1)

        hfs = guess_hfs(page, pages)

        block = ''
        rects = []
        numWords = 0

        for line in page.findall('.//LINE'):
            # skip headers/footers
            if line in hfs:
                continue

            top = left = sys.maxint
            right = bottom = -1
            numWordsInLine = 0

            for word in line.findall('.//WORD'):
                text = word.text
                if text is None:
                    # Nothing to contribute to the block text.
                    continue

                numWordsInLine += 1

                # coords are l,b,r,t; indexed rather than unpacked so that
                # any extra trailing values are ignored.
                coords = [int(c) for c in word.get('coords').split(',')]
                left = min(left, coords[0])
                bottom = max(bottom, coords[1])
                right = max(right, coords[2])
                top = min(top, coords[3])

                block += text + ' '
                numWords += 1

                if text.endswith('.') and numWords > minWordsInBlock:
                    # Sentence end: close the block in the middle of the line.
                    rects.append([left, bottom, right, top])
                    rects.insert(0, block.strip())
                    textBlocks.append(rects)
                    block = ''
                    rects = []
                    numWords = 0
                    # Start a fresh rectangle for the rest of this line; a
                    # zero count prevents an empty rectangle at end of line.
                    numWordsInLine = 0
                    top = left = sys.maxint
                    right = bottom = -1

            # end of line
            if numWordsInLine > 0:
                rects.append([left, bottom, right, top])

            if numWords > maxWordsInBlock:
                rects.insert(0, block.strip())
                textBlocks.append(rects)
                block = ''
                numWords = 0
                rects = []

        # Flush whatever text is left after the last line.
        if '' != block:
            rects.insert(0, block.strip())
            textBlocks.append(rects)

    print('br.%s(%s);' % (callback, json.dumps(textBlocks)))
예제 #6
0
def main(argv):
    import optparse
    parser = optparse.OptionParser(usage='usage: %prog [options]',
                                   version='%prog 0.1',
                                   description='make tocs')
    parser.add_option('--in_deriver',
                      action='store_true',
                      default=False,
                      help='match deriver-style args')
    parser.add_option('--simpletoc',
                      action='store_true',
                      default=False,
                      help='do a simple toc analysis')
    parser.add_option('--human',
                      action='store_true',
                      default=False,
                      help='print some human-readable stuff')
    global opts
    opts, args = parser.parse_args(argv)

    if opts.in_deriver:
        (dvju_xml_path, scandata_path) = args
        iabook = DeriverBook(dvju_xml_path, scandata_path)
    else:
        doc = ''
        callback = None
        if len(args) == 4:
            (item_id, doc, path, callback) = args
        elif len(args) == 3:
            (item_id, doc, path) = args
        else:
            (book_id, ) = args
            doc = path = book_id
        book_id = args[0]
        iabook = Book(book_id, doc, path)
    global scandata_ns
    scandata_ns = iabook.get_scandata_ns()
    if djvu:
        pages = iabook.get_pages_as_djvu()
    else:
        pages = iabook.get_pages_as_abbyy()
    pages = filter(pages)
    pages = annotate(pages)

    def clear_page(page):
        page.clear()

    windowed_pages = windowed_iterator(pages, 5, clear_page)
    pages = analyze(windowed_pages)

    if opts.simpletoc:
        toc_result = make_toc.simple_make_toc(iabook, pages)
    else:
        toc_result = make_toc.make_toc(iabook, pages, hardcode_toc_pages,
                                       hardcode_nottoc_pages)

    # toc_result['readable'] = print_readable(toc_result['qdtoc'])

    if opts.simpletoc:
        xml = toc_to_xml.make_xml(toc_result['qdtoc_tuples'], version)
        print etree.tostring(xml, pretty_print=True),  # pretty_print adds nl
    elif opts.human:
        for r in ('readable', 'comments', 'isok'):
            print r + ':'
            print toc_result[r]
            print
    else:
        if callback is not None:
            print '%s(' % callback
            print json.dumps(toc_result)
            # print_one_per_line(qdtoc)
        if callback is not None:
            print ')'
        else:
            print json.dumps(toc_result, indent=4)
예제 #7
0
def main(args):
    """Print the text blocks of one DjVu XML page as a JSONP call.

    ``args`` is ``[path, pageNum, callback]``:

    * ``path`` must match ``/<1-2 digits>/items/..._djvu.xml``; otherwise the
      process exits with status -1.
    * ``pageNum`` is the zero-based index of the ``OBJECT`` element to use.
      If the document has fewer pages than that, the last page read is used
      (unchanged from the previous version).
    * ``callback`` is forced to ``"ttsStartCB"`` unless it is exactly
      ``"ttsNextPageCB"``.

    Output is ``br.<callback>(<json>);`` where the JSON is a list of blocks.
    Each block is a list whose first element is the block's text and whose
    remaining elements are ``[left, bottom, right, top]`` rectangles, one per
    line (or part of a line) that contributed words to the block.

    A block is closed when a word ends with "." and the block holds more than
    ``minWordsInBlock`` words, or at the end of a line once it holds more
    than ``maxWordsInBlock`` words.  Lines reported by ``guess_hfs`` are
    skipped.

    Changes from the previous version:

    * the input file is closed when processing ends, including on error
      paths (it was previously left open);
    * the path pattern is a raw string and the "." before ``xml`` is escaped,
      so only a literal ``_djvu.xml`` suffix is accepted;
    * a document with no pages exits with status -1 rather than failing with
      an unbound ``page`` variable;
    * ``WORD`` elements without text are skipped instead of raising
      ``TypeError`` when their text is concatenated.
    """
    path = args[0]
    pageNum = int(args[1])
    callback = args[2]

    if not re.match(r"^/\d{1,2}/items/.+_djvu\.xml$", path):
        sys.exit(-1)

    if "ttsNextPageCB" != callback:
        callback = "ttsStartCB"

    def clear_page(page):
        page.clear()

    textBlocks = []

    # Pages are parsed lazily, so the file must stay open until all page
    # data has been consumed.
    with open(path) as f:
        context = etree.iterparse(f, tag="OBJECT")
        # iterparse yields (event, element) pairs; only the element is used.
        elements = (element for _event, element in context)
        pages = windowed_iterator(elements, windowsize, clear_page)

        page = None
        for i, page in enumerate(pages):
            if i == pageNum:
                break
        if page is None:
            # No OBJECT element at all: nothing to emit.
            sys.exit(-1)

        hfs = guess_hfs(page, pages)

        block = ""
        rects = []
        numWords = 0

        for line in page.findall(".//LINE"):
            # skip headers/footers
            if line in hfs:
                continue

            top = left = sys.maxint
            right = bottom = -1
            numWordsInLine = 0

            for word in line.findall(".//WORD"):
                text = word.text
                if text is None:
                    # Nothing to contribute to the block text.
                    continue

                numWordsInLine += 1

                # coords are l,b,r,t; indexed rather than unpacked so that
                # any extra trailing values are ignored.
                coords = [int(c) for c in word.get("coords").split(",")]
                left = min(left, coords[0])
                bottom = max(bottom, coords[1])
                right = max(right, coords[2])
                top = min(top, coords[3])

                block += text + " "
                numWords += 1

                if text.endswith(".") and numWords > minWordsInBlock:
                    # Sentence end: close the block in the middle of the line.
                    rects.append([left, bottom, right, top])
                    rects.insert(0, block.strip())
                    textBlocks.append(rects)
                    block = ""
                    rects = []
                    numWords = 0
                    # Start a fresh rectangle for the rest of this line; a
                    # zero count prevents an empty rectangle at end of line.
                    numWordsInLine = 0
                    top = left = sys.maxint
                    right = bottom = -1

            # end of line
            if numWordsInLine > 0:
                rects.append([left, bottom, right, top])

            if numWords > maxWordsInBlock:
                rects.insert(0, block.strip())
                textBlocks.append(rects)
                block = ""
                numWords = 0
                rects = []

        # Flush whatever text is left after the last line.
        if "" != block:
            rects.insert(0, block.strip())
            textBlocks.append(rects)

    print("br.%s(%s);" % (callback, json.dumps(textBlocks)))
예제 #8
0
def main(argv):
    import optparse
    parser = optparse.OptionParser(usage='usage: %prog [options]',
                                   version='%prog 0.1',
                                   description='make tocs')
    parser.add_option('--in_deriver',
                      action='store_true',
                      default=False,
                      help='match deriver-style args')
    parser.add_option('--simpletoc',
                      action='store_true',
                      default=False,
                      help='do a simple toc analysis')
    parser.add_option('--human',
                      action='store_true',
                      default=False,
                      help='print some human-readable stuff')
    global opts
    opts, args = parser.parse_args(argv)

    if opts.in_deriver:
        (dvju_xml_path, scandata_path) = args
        iabook = DeriverBook(dvju_xml_path, scandata_path)
    else:
        doc = ''
        callback = None
        if len(args) == 4:
            (item_id, doc, path, callback) = args
        elif len(args) == 3:
            (item_id, doc, path) = args
        else:
            (book_id,) = args
            doc = path = book_id
        book_id = args[0]
        iabook = Book(book_id, doc, path)
    global scandata_ns
    scandata_ns = iabook.get_scandata_ns()
    if djvu:
        pages = iabook.get_pages_as_djvu()
    else:
        pages = iabook.get_pages_as_abbyy()
    pages = filter(pages)
    pages = annotate(pages)
    def clear_page(page):
        page.clear()
    windowed_pages = windowed_iterator(pages, 5, clear_page)
    pages = analyze(windowed_pages)

    if opts.simpletoc:
        toc_result = make_toc.simple_make_toc(iabook, pages)
    else:
        toc_result = make_toc.make_toc(iabook, pages, hardcode_toc_pages, hardcode_nottoc_pages)

    # toc_result['readable'] = print_readable(toc_result['qdtoc'])

    if opts.simpletoc:
        xml = toc_to_xml.make_xml(toc_result['qdtoc_tuples'], version)
        print etree.tostring(xml, pretty_print=True), # pretty_print adds nl
    elif opts.human:
        for r in ('readable', 'comments', 'isok'):
            print r + ':'
            print toc_result[r]
            print
    else:
        if callback is not None:
            print '%s(' % callback
            print json.dumps(toc_result)
            # print_one_per_line(qdtoc)
        if callback is not None:
            print ')'
        else:
            print json.dumps(toc_result, indent=4)