def simple_make_toc(iabook, pages):
    result = {
        'isok': True,
        'has_contents': True,
        'has_pagenos': True,
        'qdtoc': [],
        'qdtoc_tuples': [],
    }
    contentscount = iabook.get_contents_count()
    if contentscount == 0:
        result['has_contents'] = False
        return result
    if not iabook.has_pagenos():
        result['has_pagenos'] = False

    tcs = []
    for page in pages:
        if page.info['type'] == 'contents':
            tcs.append(TocCandidate(page))
            tcs[-1].score = 50
            if len(tcs) == contentscount:
                break

    # (for qdtoc algorithm)
    # Go through all tc candidates
    # - append to toc struct based on matches / nearno
    # - collect from tc: pageno cands = pageno_cands / pageno_cands_unfiltered
    all_pageno_cands = []
    for i, tc in enumerate(tcs):
        if i == 0:
            all_pageno_cands += tc.pageno_cands
        else:
            all_pageno_cands += tc.pageno_cands_unfiltered

    # Pull a sorted increasing subset from pageno_cands - hopefully this'll
    # filter out chaff pagenos, as ToCs always have increasing page numbers.
    # Initialize to [] so get_qdtoc below doesn't hit a NameError when no
    # candidates were found.
    all_pageno_cands_f = []
    if len(all_pageno_cands) > 0:
        all_pageno_cands_f = extract_sorted(all_pageno_cands)

    for tc in tcs:
        toc_els, tuple_toc_els = tc.get_qdtoc(all_pageno_cands_f, iabook)
        result['qdtoc'] += toc_els
        result['qdtoc_tuples'] += tuple_toc_els
    result['qdtoc'] = cleanup_toc(result['qdtoc'])
    return result
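# extract_sorted() is used above but not defined in this section.  What
# follows is a minimal sketch of the behavior the callers assume: given a
# list of page-number candidates, return the longest subsequence whose
# values are nondecreasing.  The candidate shape (kind, val, wordidx,
# leafidx) is an assumption inferred from how candidates are built and
# consumed in find_pageno_cands below; the real helper may differ.

import collections

pagenocand_sketch = collections.namedtuple(
    'pagenocand_sketch', 'kind val wordidx leafidx')

def extract_sorted_sketch(cands):
    """O(n^2) longest-nondecreasing-subsequence DP over cand.val."""
    if not cands:
        return []
    # best[i]: length of the longest nondecreasing run ending at i;
    # prev[i]: index of the previous element in that run, or -1.
    best = [1] * len(cands)
    prev = [-1] * len(cands)
    for i in range(1, len(cands)):
        for j in range(i):
            if cands[j].val <= cands[i].val and best[j] + 1 > best[i]:
                best[i] = best[j] + 1
                prev[i] = j
    # walk back from the endpoint of the longest run
    i = best.index(max(best))
    out = []
    while i != -1:
        out.append(cands[i])
        i = prev[i]
    out.reverse()
    return out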
def make_toc(iabook, pages, contents_leafs=None, not_contents_leafs=None):
    result = {
        'isok': True,
        'has_contents': True,
        'has_pagenos': True,
        'comments': 'http://archive.org/stream/%s\n' % (iabook.book_id),
        'toc': [],
        'qdtoc': []
    }
    result['comments'] += 'http://openlibrary.org/ia/%s\n' % (iabook.book_id)
    contentscount = iabook.get_contents_count()
    if contentscount == 0:
        result['has_contents'] = False
    if not iabook.has_pagenos():
        result['has_pagenos'] = False

    # XXXmm these might help with speed
    # if iabook.has_pagenos():
    #     result['failedbkno'] = 'nope'
    #     failit(result, 'already saw these books')
    #     return result
    # if not iabook.has_pagenos():
    #     result['failedbkno'] = 'nope'
    #     failit(result, 'failed bc no pagenos marked')
    #     return result
    # if contentscount == 0:
    #     result['failedbkno'] = 'nope'
    #     failit(result, 'failed bc no contents page marked')
    #     return result

    # Loop through all pages.  For some set of early pages, make
    # TocCandidate objs - and compare each subsequent page to each
    # TocCandidate created so far, subject to some rules intended to
    # speed things up.  Accumulate TocCandidate scores / info.
    # XXX detect title page and only accept contents, if after?
    tcs = []
    toc_threshold = 3  # if some tc page goes above this score, then...

    # skip early book pages altogether
    skip_pages_until_index = 0
    # try this many toc pages (starting at skip_pages) if none marked
    toc_count_to_try_if_no_contents_info = 40  # 20
    # try this many toc pages if 1 is marked
    toc_count_to_try_if_contents_info = 40  # 8
    # fail if more than this many contiguous good toc pages are seen;
    # should be less than the two vars above...
    max_allowed_toc_len = 40  # 7

    # These values control comparing book pages to a sliding window of
    # already-found toc pages.  Can result in early toc candidates being
    # 'shadowed' by spurious later ones - e.g. lists of illustrations.
    windowed_match = False
    # first candidate toc page to compare subsequent pages to; this is
    # adjusted upward as new candidate pages are found.
    toc_window_start = 0
    # at most this many toc pages, forward of the most recently
    # threshold-making toc, are checked against the current body page
    toc_window_size = 1
    toc_window_end = sys.maxint

    # the last tocpage may be below the threshold, but is only kept if
    # it's above this
    trailing_tocpage_threshold = 1

    for page in pages:
        # XXX remove?
        if (len(page.info['number']) == 0
            and 'pageno_guess' in page.info):
            page.info['number'] = str(page.info['pageno_guess'])
        # skip on if we're waiting for a contents page
        if (len(tcs) == 0 and contentscount > 0
            and page.info['type'] != 'contents'):
            continue
        if page.index < skip_pages_until_index:
            continue
        # l("%s: %s %s" % (page.index, page.info['number'],
        #                  page.info['type']))
        for i, tc in enumerate(tcs):
            if windowed_match:
                if i < toc_window_start:
                    continue
                if i > toc_window_end:
                    continue
            tc.match_page(page)
            if tc.score > toc_threshold:
                toc_window_start = i
                if toc_window_end < toc_window_start + toc_window_size:
                    toc_window_end = toc_window_start + toc_window_size

        # Append the current page as a toc candidate...
        # if contentscount is...
        # 0:  build contents pages for first n pages
        # 2+: build contents pages exactly for designated pages
        # 1:  skip pages before 1st contents page, build n_optimistic
        #     pages, remember to score later pages
        if ((contentscount == 0
             and len(tcs) < toc_count_to_try_if_no_contents_info)
            or (contentscount == 1
                and len(tcs) < toc_count_to_try_if_contents_info)
            or (contentscount > 1 and len(tcs) < contentscount
                and page.info['type'] == 'contents')
            or (contents_leafs and page.index in contents_leafs)):
            tcs.append(TocCandidate(page))
            # support hardcode_toc_pages
            if contents_leafs and page.index in contents_leafs:
                tcs[-1].score = 10
            if not_contents_leafs and page.index in not_contents_leafs:
                tcs[-1].score = -10
        # break early if contents_leafs is around and we've just passed
        # the last designated leaf
        if (contents_leafs and page.index - 1 in contents_leafs
            and page.index not in contents_leafs):
            break

    # adjust tc scores to promote trailing pages
    saw_good_score = False
    for i, tc in enumerate(tcs):
        tc.adj_score = tc.score
        if not saw_good_score:
            if tc.score > toc_threshold:
                saw_good_score = True
            continue
        if (tc.score < toc_threshold
            and tc.score > trailing_tocpage_threshold
            and i + 1 < len(tcs)
            and tcs[i + 1].score < toc_threshold):
            tc.adj_score = toc_threshold
            break

    # (for qdtoc algorithm)
    # Go through all tc candidates
    # - append to toc struct based on matches / nearno
    # - collect from tc: pageno cands = pageno_cands / pageno_cands_unfiltered
    all_pageno_cands = []
    for i, tc in enumerate(tcs):
        score = tc.score
        # if tc is not last, but the *next* tc is below threshold,
        # accept if better than trailing_tocpage_threshold.
        if i + 1 < len(tcs):
            if (tcs[i + 1].score < toc_threshold
                and tc.score >= trailing_tocpage_threshold):
                score = toc_threshold
        if score >= toc_threshold:
            for match in tc.matches:
                info = tc.matchinfo[match]
                tocitem_words = info.matchwords()
                pagenum = info.nearno
                if info.nearno != -1:
                    pagenum = tc.words[pagenum]
                labelwords, titlewords = guess_label(tocitem_words)
                result['toc'].append({'level': 1,
                                      'label': (' '.join(labelwords)).strip(),
                                      'title': (' '.join(titlewords)).strip(),
                                      'pagenum': pagenum,
                                      'tocpage': i})
            if i == 0:
                all_pageno_cands += tc.pageno_cands
            else:
                all_pageno_cands += tc.pageno_cands_unfiltered

    # Pull a sorted increasing subset from pageno_cands - hopefully this'll
    # filter out chaff pagenos, as ToCs always have increasing page numbers.
    # Initialize to [] so get_qdtoc below can't hit a NameError.
    all_pageno_cands_f = []
    if len(all_pageno_cands) > 0:
        all_pageno_cands_f = extract_sorted(all_pageno_cands)

    # create qdtoc:
    # loop through toc candidates and append qdtoc from each.
    good_toc_count = 0
    most_recent_toc = 0
    for i, tc in enumerate(tcs):
        l(result, '%s %s' % (tc.page.index, tc.score))
        if tc.score > toc_threshold:
            good_toc_count += 1
            if good_toc_count >= max_allowed_toc_len:
                failit(result, 'failed due to too many toc pages')
                return result
            if most_recent_toc != 0:
                if i >= most_recent_toc + 2:
                    failit(result, 'failed due to discontiguous tocs')
                    return result
            most_recent_toc = i
            toc_els, tuple_toc_els = tc.get_qdtoc(all_pageno_cands_f)
            result['qdtoc'] += toc_els
    result['qdtoc'] = cleanup_toc(result['qdtoc'])
    check_toc(result)
    return result
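# guess_label() is consumed above but not defined in this section.
# make_toc assumes it splits a toc line's words into leading label words
# (e.g. 'Chapter', 'IV') and the remaining title words.  A minimal
# sketch of that contract, reusing the label keywords that
# find_pageno_cands below also checks, and assuming rnum_to_int()
# returns 0 for words that aren't roman numerals; the real helper may
# recognize more label shapes.

def guess_label_sketch(words):
    label_heads = ('chapter', 'part', 'section', 'book')
    labelwords = []
    titlewords = list(words)
    # peel off a label keyword, plus an immediately following chapter
    # number (arabic or roman), if present
    if titlewords and titlewords[0].lower().strip('.') in label_heads:
        labelwords.append(titlewords.pop(0))
        if titlewords and (titlewords[0].strip('.').isdigit()
                           or rnum_to_int(titlewords[0].strip('.')) != 0):
            labelwords.append(titlewords.pop(0))
    # a bare leading number ('IV.', '7') can also serve as the label
    elif titlewords and (titlewords[0].strip('.').isdigit()
                         or rnum_to_int(titlewords[0].strip('.')) != 0):
        labelwords.append(titlewords.pop(0))
    return labelwords, titlewords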
def find_pageno_cands(self, filtered=True):
    """Find the set of all numbers on this page that don't
    e.g. follow 'chapter', then find the largest increasing subset of
    these numbers; we consider it likely that these'll be book page
    numbers.
    """
    # XXX i18n someday
    labels = ('chapter', 'part', 'section', 'book')
    if 'bounds' in self.page.info:
        bounds = self.page.info['bounds']
    else:
        bounds = self.page.find_text_bounds()
    xcutoff = bounds.l + (bounds.r - bounds.l) * PAGENO_X_RATIO

    def get_page_cands(page):
        # XXX mini stack to handle last-3 words?
        lastword = ''
        for i, word in enumerate(self.wordtuples):
            box = word.box
            word = word.text
            if box.l < xcutoff or lastword in labels:
                lastword = word
                continue
            r = rnum_to_int(word)
            if r != 0:
                yield pagenocand('0roman', r, i, self.page.index)
            for w in (word,
                      # lastword + word
                      ):
                # XXX should add on-same-line check to above
                # XXX this doesn't work currently, as it doesn't
                # get reflected in output!
                if w.isdigit():
                    val = int(w)
                    if val < 999:
                        # XXX replace above check with book page count
                        # if avail.
                        yield pagenocand('1arabic', val, i,
                                         self.page.index)
            lastword = word

    def printword(c):
        # pagenocand is a 4-tuple; the leaf index isn't printed
        ar, val, loc, leaf = c
        print "%s %s - %s" % (ar, val, self.words[loc])

    page_cands = [c for c in get_page_cands(self.page)]
    result = page_cands
    if filtered and len(page_cands) > 0:
        extracted_orig = extract_sorted(page_cands)
        result = extracted_orig
    if filtered and less_greedy_heuristic:
        if len(page_cands) != 0:
            if (len(extracted_orig) > 3
                and extracted_orig[0].val + 2 >= extracted_orig[1].val
                and extracted_orig[1].val + 2 >= extracted_orig[2].val):
                # The first few extracted values are suspiciously close
                # together; drop the second and third and re-extract to
                # see whether a better subsequence emerges.
                page_cands_filtered = []
                for c in page_cands:
                    if c != extracted_orig[1] and c != extracted_orig[2]:
                        page_cands_filtered.append(c)
                extracted_filtered = extract_sorted(page_cands_filtered)
                # Did we just lose a pair we wanted?  See if
                # extracted_filtered is the same, but for the missing two.
                unchanged_sort = False
                if (len(extracted_filtered) + 2 == len(extracted_orig)
                    and extracted_filtered[0] == extracted_orig[0]):
                    unchanged_sort = True
                    i = 1
                    while i < len(extracted_orig) - 2:
                        if extracted_filtered[i] != extracted_orig[i + 2]:
                            unchanged_sort = False
                            break
                        i += 1
                if (unchanged_sort
                    or len(extracted_filtered) + 4 < len(extracted_orig)):
                    # too much loss, go with orig
                    result = extracted_orig
                else:
                    result = extracted_filtered
    return result
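# rnum_to_int() is assumed above to parse a roman-numeral word ('xiv',
# 'IV') into an int, returning 0 for anything that isn't one - the
# callers test 'if r != 0'.  A minimal sketch of that contract; note it
# is permissive about malformed numerals like 'iiii', which is probably
# acceptable for OCR'd text.

def rnum_to_int_sketch(word):
    vals = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100,
            'd': 500, 'm': 1000}
    word = word.lower()
    if not word:
        return 0
    for ch in word:
        if ch not in vals:
            return 0
    total = 0
    for pos in range(len(word)):
        # subtractive pairs like 'iv' and 'ix' count negatively
        if pos + 1 < len(word) and vals[word[pos]] < vals[word[pos + 1]]:
            total -= vals[word[pos]]
        else:
            total += vals[word[pos]]
    return total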