示例#1
0
def simple_make_toc(iabook, pages):
    """Build the quick 'qdtoc' lists from pages already typed as contents.

    iabook must provide get_contents_count() and has_pagenos(); each page
    must expose an ``info`` dict with a 'type' entry.

    Returns a dict with the flags 'isok', 'has_contents' and 'has_pagenos'
    plus the lists 'qdtoc' and 'qdtoc_tuples'.  If the book reports zero
    contents pages the dict is returned at once with 'has_contents' False.
    """
    result = dict(
        isok=True,
        has_contents=True,
        has_pagenos=True,
        qdtoc=[],
        qdtoc_tuples=[],
    )

    contentscount = iabook.get_contents_count()
    if contentscount == 0:
        result['has_contents'] = False
        return result
    if not iabook.has_pagenos():
        result['has_pagenos'] = False

    # One TocCandidate per 'contents' page, stopping once as many have
    # been collected as the book says it has.
    candidates = []
    for page in pages:
        if page.info['type'] != 'contents':
            continue
        candidate = TocCandidate(page)
        candidate.score = 50
        candidates.append(candidate)
        if len(candidates) == contentscount:
            break

    # (for qdtoc algorithm)
    # Pool the page-number candidates: the filtered list from the first
    # toc page, the unfiltered list from every later one.
    pageno_pool = []
    for position, candidate in enumerate(candidates):
        if position == 0:
            pageno_pool += candidate.pageno_cands
        else:
            pageno_pool += candidate.pageno_cands_unfiltered

    if not pageno_pool:
        return result

    # Pull a sorted increasing set out of the pool - hopefully this
    # filters out chaff pagenos, as ToCs always have increasing page
    # numbers.
    increasing = extract_sorted(pageno_pool)
    for candidate in candidates:
        toc_els, tuple_toc_els = candidate.get_qdtoc(increasing, iabook)
        result['qdtoc'] += toc_els
        result['qdtoc_tuples'] += tuple_toc_els
    result['qdtoc'] = cleanup_toc(result['qdtoc'])
    return result
示例#2
0
def simple_make_toc(iabook, pages):
    """Assemble 'qdtoc' / 'qdtoc_tuples' using only pages typed 'contents'.

    The returned dict always carries 'isok', 'has_contents',
    'has_pagenos', 'qdtoc' and 'qdtoc_tuples'.  A book whose
    get_contents_count() is 0 short-circuits with 'has_contents' False;
    a book whose has_pagenos() is falsy gets 'has_pagenos' False.
    """
    result = {}
    result['isok'] = True
    result['has_contents'] = True
    result['has_pagenos'] = True
    result['qdtoc'] = []
    result['qdtoc_tuples'] = []

    wanted = iabook.get_contents_count()
    if wanted == 0:
        result['has_contents'] = False
        return result
    if not iabook.has_pagenos():
        result['has_pagenos'] = False

    # Wrap contents pages as candidates until the advertised number of
    # contents pages has been reached.
    toc_pages = []
    contents_pages = (pg for pg in pages if pg.info['type'] == 'contents')
    for pg in contents_pages:
        cand = TocCandidate(pg)
        cand.score = 50
        toc_pages.append(cand)
        if len(toc_pages) == wanted:
            break

    # (for qdtoc algorithm)
    # First toc page contributes its filtered pageno candidates, the
    # remaining ones contribute their unfiltered candidates.
    pool = []
    if toc_pages:
        pool.extend(toc_pages[0].pageno_cands)
        for cand in toc_pages[1:]:
            pool.extend(cand.pageno_cands_unfiltered)

    if pool:
        # Keep only a sorted increasing run of page numbers - ToCs
        # always have increasing page numbers, so this should drop chaff.
        rising = extract_sorted(pool)
        for cand in toc_pages:
            toc_els, tuple_toc_els = cand.get_qdtoc(rising, iabook)
            result['qdtoc'].extend(toc_els)
            result['qdtoc_tuples'].extend(tuple_toc_els)
        result['qdtoc'] = cleanup_toc(result['qdtoc'])
    return result
示例#3
0
def make_toc(iabook, pages, contents_leafs=None, not_contents_leafs=None):
    """Score candidate contents pages and build the 'toc' / 'qdtoc' lists.

    Args:
        iabook: book object; this function reads ``book_id`` and calls
            ``get_contents_count()`` and ``has_pagenos()``.
        pages: iterable of page objects exposing ``index`` and an ``info``
            dict with at least 'number' and 'type'.
        contents_leafs: optional collection of page indexes that are
            always turned into toc candidates and given score 10.
        not_contents_leafs: optional collection of page indexes whose
            toc candidate (if one was created) is given score -10.

    Returns:
        The result dict ('isok', 'has_contents', 'has_pagenos',
        'comments', 'toc', 'qdtoc').  When too many, or discontiguous,
        above-threshold toc pages are seen, ``failit`` is called on the
        dict and it is returned before 'qdtoc' is cleaned up.
    """
    result = {
        'isok': True,
        'has_contents': True,
        'has_pagenos': True,
        'comments': 'http://archive.org/stream/%s\n' % (iabook.book_id),
        'toc': [],
        'qdtoc': [],
    }
    result['comments'] += 'http://openlibrary.org/ia/%s\n' % (iabook.book_id)
    contentscount = iabook.get_contents_count()
    if contentscount == 0:
        result['has_contents'] = False
    if not iabook.has_pagenos():
        result['has_pagenos'] = False

    # loop through all pages.  For some set of early pages, make
    # TocCandidate objs - and compare each subsequent page to each
    # TocCandidate created so far, subject to some rules intended to
    # speed things up.  Accumulate TocCandidate scores / info

    # XXX detect title page and only accept contents, if after?

    tcs = []
    toc_threshold = 3  # if some tc page goes above this score, then...

    # skip early book pages altogether
    skip_pages_until_index = 0

    # Try this many toc pages (starting at skip_pages) if none marked
    toc_count_to_try_if_no_contents_info = 40  # 20

    # Try this many toc pages if 1 is marked
    toc_count_to_try_if_contents_info = 40  # 8

    # fail if more than this many contiguous good toc pages are seen
    # should be less than two vars above...
    max_allowed_toc_len = 40  # 7

    # these values control comparing book pages to a sliding window of
    # already-found toc pages.  Can result in early toc candidates
    # being 'shadowed' by spurious later ones - e.g. lists of
    # illustration.
    windowed_match = False

    # first candidate toc page to compare subsequent pages to; this is
    # adjusted upward as new candidate pages are found.
    toc_window_start = 0

    # at most this many toc pages are checked against current body
    # page forward of the most recently threshold-making toc
    toc_window_size = 1

    # "No upper bound yet".  float('inf') compares like the former
    # sys.maxint but also exists on Python 3.
    toc_window_end = float('inf')

    # the last tocpage may be below the threshold, but is only kept if
    # it's above this
    trailing_tocpage_threshold = 1

    for page in pages:
        # XXX remove?
        if (len(page.info['number']) == 0
                and 'pageno_guess' in page.info):
            page.info['number'] = str(page.info['pageno_guess'])

        # skip on if we're waiting for a contents page
        if (len(tcs) == 0
                and contentscount > 0
                and page.info['type'] != 'contents'):
            continue
        if page.index < skip_pages_until_index:
            continue

        for i, tc in enumerate(tcs):
            if windowed_match:
                if i < toc_window_start:
                    continue
                if i > toc_window_end:
                    continue
            tc.match_page(page)
            if tc.score > toc_threshold:
                toc_window_start = i
                if toc_window_end < toc_window_start + toc_window_size:
                    toc_window_end = toc_window_start + toc_window_size

        # append the current page as a toc candidate...
        # if contentscount is...
        # 0: build contents pages for first n pages
        # 2+: build contents pages exactly for designated pages
        # 1: skip pages before 1rst contents page, build n_optimistic
        #    pages, remember to score later pages
        appended = False
        if ((contentscount == 0
             and len(tcs) < toc_count_to_try_if_no_contents_info)
                or (contentscount == 1
                    and len(tcs) < toc_count_to_try_if_contents_info)
                or (contentscount > 1 and len(tcs) < contentscount
                    and page.info['type'] == 'contents')
                or (contents_leafs and page.index in contents_leafs)):
            tcs.append(TocCandidate(page))
            appended = True

        # break early if contents_leafs is around
        if (contents_leafs and page.index - 1 in contents_leafs
                and page.index not in contents_leafs):
            break

        # support hardcode_toc_pages.  Only touch tcs[-1] when it really
        # is the candidate for *this* page; otherwise the override would
        # land on whichever earlier page happened to be appended last.
        if appended:
            if contents_leafs and page.index in contents_leafs:
                tcs[-1].score = 10
            if not_contents_leafs and page.index in not_contents_leafs:
                tcs[-1].score = -10

    # adjust tc scores to promote trailing pages
    saw_good_score = False
    for i, tc in enumerate(tcs):
        tc.adj_score = tc.score
        if not saw_good_score:
            if tc.score > toc_threshold:
                saw_good_score = True
                continue
        if (tc.score < toc_threshold
                and tc.score > trailing_tocpage_threshold
                and i + 1 < len(tcs)
                and tcs[i + 1].score < toc_threshold):
            tc.adj_score = toc_threshold
            break

    # (for qdtoc algorithm)
    # Go through all tc candidates
    # - append to toc struct based on matches / nearno
    # - collect from tc: pageno cands = pageno_cands / pageno_cands_unfiltered
    all_pageno_cands = []
    for i, tc in enumerate(tcs):
        score = tc.score

        # if tc is not last, but *next* tc is below threshold,
        # accept if better than trailing_tocpage_threshold.
        if i + 1 < len(tcs):
            if (tcs[i + 1].score < toc_threshold
                    and tc.score >= trailing_tocpage_threshold):
                score = toc_threshold

        if score >= toc_threshold:
            for match in tc.matches:
                info = tc.matchinfo[match]
                tocitem_words = info.matchwords()
                # nearno is an index into tc.words, or -1 for "none"
                pagenum = info.nearno
                if info.nearno != -1:
                    pagenum = tc.words[pagenum]
                labelwords, titlewords = guess_label(tocitem_words)
                result['toc'].append({
                    'level': 1,
                    'label': (' '.join(labelwords)).strip(),
                    'title': (' '.join(titlewords)).strip(),
                    'pagenum': pagenum,
                    'tocpage': i,
                })
            if i == 0:
                all_pageno_cands += tc.pageno_cands
            else:
                all_pageno_cands += tc.pageno_cands_unfiltered

    # pull sorted increasing set from pageno_cands - hopefully this'll
    # filter out chaff pagenos, as ToCs always have increasing page
    # numbers.
    if len(all_pageno_cands) > 0:
        all_pageno_cands_f = extract_sorted(all_pageno_cands)

        # create qdtoc:
        # loop through toc candidates and append qdtoc from each.
        good_toc_count = 0
        # Index (into tcs) of the last above-threshold toc page.  None
        # rather than 0 means "none seen yet", so index 0 is a valid
        # value and the contiguity check below can actually trigger.
        most_recent_toc = None
        for i, tc in enumerate(tcs):
            l(result, '%s %s' % (tc.page.index, tc.score))
            if tc.score > toc_threshold:
                good_toc_count += 1
                if good_toc_count >= max_allowed_toc_len:
                    failit(result, 'failed due to too many toc pages')
                    return result
                if (most_recent_toc is not None
                        and i >= most_recent_toc + 2):
                    failit(result, 'failed due to discontiguous tocs')
                    return result
                most_recent_toc = i
                # NOTE(review): simple_make_toc passes iabook as a second
                # argument to get_qdtoc; confirm the one-argument call
                # here is intended.
                toc_els, _unused_tuples = tc.get_qdtoc(all_pageno_cands_f)
                result['qdtoc'] += toc_els

        result['qdtoc'] = cleanup_toc(result['qdtoc'])
        check_toc(result)
    return result
示例#4
0
    def find_pageno_cands(self, filtered=True):
        """Find numbers on this page that are likely book page numbers.

        Collects every roman or arabic number in ``self.wordtuples`` that
        lies right of a cutoff derived from the page's text bounds and
        that doesn't directly follow a label word such as 'chapter'.
        With ``filtered`` true the candidates are then reduced by
        ``extract_sorted`` (per the original notes: the largest
        increasing subset); we consider it likely that these'll be book
        page numbers.

        Args:
            filtered: if false, return every raw candidate; if true,
                return the extracted increasing run, optionally refined
                by the less-greedy heuristic when the module flag
                ``less_greedy_heuristic`` is set.

        Returns:
            A list of ``pagenocand`` values (empty if none were found).
        """
        # XXX i18n someday
        labels = ('chapter', 'part', 'section', 'book')
        if 'bounds' in self.page.info:
            bounds = self.page.info['bounds']
        else:
            bounds = self.page.find_text_bounds()
        # Words whose left edge is left of this x position are ignored.
        xcutoff = bounds.l + (bounds.r - bounds.l) * PAGENO_X_RATIO

        def get_page_cands():
            # XXX mini stack to handle last-3 words ?
            lastword = ''
            for i, wordtuple in enumerate(self.wordtuples):
                box = wordtuple.box
                word = wordtuple.text
                if box.l < xcutoff or lastword in labels:
                    lastword = word
                    continue
                r = rnum_to_int(word)
                if r != 0:
                    yield pagenocand('0roman', r, i, self.page.index)
                # XXX joining lastword + word (with an on-same-line
                # check) was tried but isn't reflected in output, so only
                # the single word is considered.
                if word.isdigit():
                    val = int(word)
                    # XXX replace this check with book page count if
                    # avail.
                    if val < 999:
                        yield pagenocand('1arabic', val, i, self.page.index)
                lastword = word

        page_cands = list(get_page_cands())

        if not filtered or len(page_cands) == 0:
            return page_cands

        extracted_orig = extract_sorted(page_cands)
        if not less_greedy_heuristic:
            return extracted_orig

        # The heuristic only applies when the run starts with three
        # values that are each within 2 of the previous one.
        if not (len(extracted_orig) > 3
                and extracted_orig[0].val + 2 >= extracted_orig[1].val
                and extracted_orig[1].val + 2 >= extracted_orig[2].val):
            return extracted_orig

        # Drop the 2nd and 3rd extracted candidates and extract again.
        page_cands_filtered = [
            c for c in page_cands
            if c != extracted_orig[1] and c != extracted_orig[2]
        ]
        extracted_filtered = extract_sorted(page_cands_filtered)

        # did we just lose a pair we wanted?
        # see if extracted_filtered is the same, but for the missing two
        unchanged_sort = False
        if (len(extracted_filtered) + 2 == len(extracted_orig)
                and extracted_filtered[0] == extracted_orig[0]):
            # Only "unchanged" if every remaining element lines up; the
            # flag used to be set even when a mismatch was found.
            unchanged_sort = True
            for i in range(1, len(extracted_filtered)):
                if extracted_filtered[i] != extracted_orig[i + 2]:
                    unchanged_sort = False
                    break

        # NOTE(review): the second test reads as inverted relative to
        # "too much loss" (it fires when the filtered run is *longer*);
        # left as is - confirm against extract_sorted's behaviour.
        if (unchanged_sort
                or len(extracted_orig) + 4 < len(extracted_filtered)):
            # too much loss, go with orig
            return extracted_orig
        return extracted_filtered
示例#5
0
def make_toc(iabook, pages, contents_leafs=None, not_contents_leafs=None):
    """Score candidate contents pages and build the 'toc' / 'qdtoc' lists.

    Args:
        iabook: book object; this function reads ``book_id`` and calls
            ``get_contents_count()`` and ``has_pagenos()``.
        pages: iterable of page objects exposing ``index`` and an ``info``
            dict with at least 'number' and 'type'.
        contents_leafs: optional collection of page indexes that are
            always turned into toc candidates and given score 10.
        not_contents_leafs: optional collection of page indexes whose
            toc candidate (if one was created) is given score -10.

    Returns:
        The result dict ('isok', 'has_contents', 'has_pagenos',
        'comments', 'toc', 'qdtoc').  When too many, or discontiguous,
        above-threshold toc pages are seen, ``failit`` is called on the
        dict and it is returned before 'qdtoc' is cleaned up.
    """
    result = {
        'isok': True,
        'has_contents': True,
        'has_pagenos': True,
        'comments': 'http://archive.org/stream/%s\n' % (iabook.book_id),
        'toc': [],
        'qdtoc': []
    }
    result['comments'] += 'http://openlibrary.org/ia/%s\n' % (iabook.book_id)
    contentscount = iabook.get_contents_count()
    if contentscount == 0:
        result['has_contents'] = False
    if not iabook.has_pagenos():
        result['has_pagenos'] = False

    # loop through all pages.  For some set of early pages, make
    # TocCandidate objs - and compare each subsequent page to each
    # TocCandidate created so far, subject to some rules intended to
    # speed things up.  Accumulate TocCandidate scores / info

    # XXX detect title page and only accept contents, if after?

    tcs = []
    toc_threshold = 3  # if some tc page goes above this score, then...

    # skip early book pages altogether
    skip_pages_until_index = 0

    # Try this many toc pages (starting at skip_pages) if none marked
    toc_count_to_try_if_no_contents_info = 40  # 20

    # Try this many toc pages if 1 is marked
    toc_count_to_try_if_contents_info = 40  # 8

    # fail if more than this many contiguous good toc pages are seen
    # should be less than two vars above...
    max_allowed_toc_len = 40  # 7

    # these values control comparing book pages to a sliding window of
    # already-found toc pages.  Can result in early toc candidates
    # being 'shadowed' by spurious later ones - e.g. lists of
    # illustration.
    windowed_match = False

    # first candidate toc page to compare subsequent pages to; this is
    # adjusted upward as new candidate pages are found.
    toc_window_start = 0

    # at most this many toc pages are checked against current body
    # page forward of the most recently threshold-making toc
    toc_window_size = 1

    # "No upper bound yet".  float('inf') compares like the former
    # sys.maxint but also exists on Python 3.
    toc_window_end = float('inf')

    # the last tocpage may be below the threshold, but is only kept if
    # it's above this
    trailing_tocpage_threshold = 1

    for page in pages:
        # XXX remove?
        if (len(page.info['number']) == 0 and 'pageno_guess' in page.info):
            page.info['number'] = str(page.info['pageno_guess'])

        # skip on if we're waiting for a contents page
        if (len(tcs) == 0 and contentscount > 0
                and page.info['type'] != 'contents'):
            continue
        if page.index < skip_pages_until_index:
            continue

        for i, tc in enumerate(tcs):
            if windowed_match:
                if i < toc_window_start:
                    continue
                if i > toc_window_end:
                    continue
            tc.match_page(page)
            if tc.score > toc_threshold:
                toc_window_start = i
                if toc_window_end < toc_window_start + toc_window_size:
                    toc_window_end = toc_window_start + toc_window_size

        # append the current page as a toc candidate...
        # if contentscount is...
        # 0: build contents pages for first n pages
        # 2+: build contents pages exactly for designated pages
        # 1: skip pages before 1rst contents page, build n_optimistic
        #    pages, remember to score later pages
        appended = False
        if ((contentscount == 0
             and len(tcs) < toc_count_to_try_if_no_contents_info)
                or (contentscount == 1
                    and len(tcs) < toc_count_to_try_if_contents_info)
                or (contentscount > 1 and len(tcs) < contentscount
                    and page.info['type'] == 'contents')
                or (contents_leafs and page.index in contents_leafs)):
            tcs.append(TocCandidate(page))
            appended = True

        # break early if contents_leafs is around
        if (contents_leafs and page.index - 1 in contents_leafs
                and page.index not in contents_leafs):
            break

        # support hardcode_toc_pages.  Only touch tcs[-1] when it really
        # is the candidate for *this* page; otherwise the override would
        # land on whichever earlier page happened to be appended last.
        if appended:
            if contents_leafs and page.index in contents_leafs:
                tcs[-1].score = 10
            if not_contents_leafs and page.index in not_contents_leafs:
                tcs[-1].score = -10

    # adjust tc scores to promote trailing pages
    saw_good_score = False
    for i, tc in enumerate(tcs):
        tc.adj_score = tc.score
        if not saw_good_score:
            if tc.score > toc_threshold:
                saw_good_score = True
                continue
        if (tc.score < toc_threshold and tc.score > trailing_tocpage_threshold
                and i + 1 < len(tcs) and tcs[i + 1].score < toc_threshold):
            tc.adj_score = toc_threshold
            break

    # (for qdtoc algorithm)
    # Go through all tc candidates
    # - append to toc struct based on matches / nearno
    # - collect from tc: pageno cands = pageno_cands / pageno_cands_unfiltered
    all_pageno_cands = []
    for i, tc in enumerate(tcs):
        score = tc.score

        # if tc is not last, but *next* tc is below threshold,
        # accept if better than trailing_tocpage_threshold.
        if i + 1 < len(tcs):
            if (tcs[i + 1].score < toc_threshold
                    and tc.score >= trailing_tocpage_threshold):
                score = toc_threshold

        if score >= toc_threshold:
            for match in tc.matches:
                info = tc.matchinfo[match]
                tocitem_words = info.matchwords()
                # nearno is an index into tc.words, or -1 for "none"
                pagenum = info.nearno
                if info.nearno != -1:
                    pagenum = tc.words[pagenum]
                labelwords, titlewords = guess_label(tocitem_words)
                result['toc'].append({
                    'level': 1,
                    'label': (' '.join(labelwords)).strip(),
                    'title': (' '.join(titlewords)).strip(),
                    'pagenum': pagenum,
                    'tocpage': i
                })
            if i == 0:
                all_pageno_cands += tc.pageno_cands
            else:
                all_pageno_cands += tc.pageno_cands_unfiltered

    # pull sorted increasing set from pageno_cands - hopefully this'll
    # filter out chaff pagenos, as ToCs always have increasing page
    # numbers.
    if len(all_pageno_cands) > 0:
        all_pageno_cands_f = extract_sorted(all_pageno_cands)

        # create qdtoc:
        # loop through toc candidates and append qdtoc from each.
        good_toc_count = 0
        # Index (into tcs) of the last above-threshold toc page.  None
        # rather than 0 means "none seen yet", so index 0 is a valid
        # value and the contiguity check below can actually trigger.
        most_recent_toc = None
        for i, tc in enumerate(tcs):
            l(result, '%s %s' % (tc.page.index, tc.score))
            if tc.score > toc_threshold:
                good_toc_count += 1
                if good_toc_count >= max_allowed_toc_len:
                    failit(result, 'failed due to too many toc pages')
                    return result
                if (most_recent_toc is not None
                        and i >= most_recent_toc + 2):
                    failit(result, 'failed due to discontiguous tocs')
                    return result
                most_recent_toc = i
                # NOTE(review): simple_make_toc passes iabook as a second
                # argument to get_qdtoc; confirm the one-argument call
                # here is intended.
                toc_els, _unused_tuples = tc.get_qdtoc(all_pageno_cands_f)
                result['qdtoc'] += toc_els

        result['qdtoc'] = cleanup_toc(result['qdtoc'])
        check_toc(result)
    return result
示例#6
0
    def find_pageno_cands(self, filtered=True):
        """Find numbers on this page that are likely book page numbers.

        Collects every roman or arabic number in ``self.wordtuples`` that
        lies right of a cutoff derived from the page's text bounds and
        that doesn't directly follow a label word such as 'chapter'.
        With ``filtered`` true the candidates are then reduced by
        ``extract_sorted`` (per the original notes: the largest
        increasing subset); we consider it likely that these'll be book
        page numbers.

        Args:
            filtered: if false, return every raw candidate; if true,
                return the extracted increasing run, optionally refined
                by the less-greedy heuristic when the module flag
                ``less_greedy_heuristic`` is set.

        Returns:
            A list of ``pagenocand`` values (empty if none were found).
        """
        # XXX i18n someday
        labels = ('chapter', 'part', 'section', 'book')
        if 'bounds' in self.page.info:
            bounds = self.page.info['bounds']
        else:
            bounds = self.page.find_text_bounds()
        # Words whose left edge is left of this x position are ignored.
        xcutoff = bounds.l + (bounds.r - bounds.l) * PAGENO_X_RATIO

        def get_page_cands():
            # XXX mini stack to handle last-3 words ?
            lastword = ''
            for i, wordtuple in enumerate(self.wordtuples):
                box = wordtuple.box
                word = wordtuple.text
                if box.l < xcutoff or lastword in labels:
                    lastword = word
                    continue
                r = rnum_to_int(word)
                if r != 0:
                    yield pagenocand('0roman', r, i, self.page.index)
                # XXX joining lastword + word (with an on-same-line
                # check) was tried but isn't reflected in output, so only
                # the single word is considered.
                if word.isdigit():
                    val = int(word)
                    # XXX replace this check with book page count if
                    # avail.
                    if val < 999:
                        yield pagenocand('1arabic', val, i,
                                         self.page.index)
                lastword = word

        page_cands = list(get_page_cands())

        if not filtered or len(page_cands) == 0:
            return page_cands

        extracted_orig = extract_sorted(page_cands)
        if not less_greedy_heuristic:
            return extracted_orig

        # The heuristic only applies when the run starts with three
        # values that are each within 2 of the previous one.
        starts_with_tight_run = (
            len(extracted_orig) > 3
            and extracted_orig[0].val + 2 >= extracted_orig[1].val
            and extracted_orig[1].val + 2 >= extracted_orig[2].val)
        if not starts_with_tight_run:
            return extracted_orig

        # Drop the 2nd and 3rd extracted candidates and extract again.
        page_cands_filtered = [
            c for c in page_cands
            if c != extracted_orig[1] and c != extracted_orig[2]
        ]
        extracted_filtered = extract_sorted(page_cands_filtered)

        # did we just lose a pair we wanted?
        # see if extracted_filtered is the same, but for the missing two
        unchanged_sort = False
        if (len(extracted_filtered) + 2 == len(extracted_orig)
                and extracted_filtered[0] == extracted_orig[0]):
            # Only "unchanged" if every remaining element lines up; the
            # flag used to be set even when a mismatch was found.
            unchanged_sort = True
            for i in range(1, len(extracted_filtered)):
                if extracted_filtered[i] != extracted_orig[i + 2]:
                    unchanged_sort = False
                    break

        # NOTE(review): the second test reads as inverted relative to
        # "too much loss" (it fires when the filtered run is *longer*);
        # left as is - confirm against extract_sorted's behaviour.
        if (unchanged_sort
                or len(extracted_orig) + 4 < len(extracted_filtered)):
            # too much loss, go with orig
            return extracted_orig
        return extracted_filtered