def get_page_cands(page):
    """Yield page-number candidates for the words in ``self.wordtuples``.

    A word is skipped when the left edge of its box (``box.l``) is below
    ``xcutoff`` or when the word before it is in ``labels``.  Every other
    word yields, in this order:

    * a ``'0roman'`` candidate when ``rnum_to_int`` returns non-zero;
    * a ``'1arabic'`` candidate when the text is all digits and its value
      is below 999.

    Each candidate is built as ``pagenocand(kind, value, word_position,
    self.page.index)``.

    NOTE(review): ``self``, ``xcutoff``, ``labels``, ``pagenocand`` and
    ``rnum_to_int`` are not defined here -- presumably this function is
    nested inside a method and closes over them; confirm.  The ``page``
    argument is not used.
    """
    # XXX mini stack to handle last-3 words ?
    previous_text = ''
    for position, entry in enumerate(self.wordtuples):
        box = entry.box
        text = entry.text
        # The label test is only evaluated for words at/after the cutoff,
        # exactly as the short-circuiting ``or`` did before.
        if not (box.l < xcutoff or previous_text in labels):
            roman_value = rnum_to_int(text)
            if roman_value != 0:
                yield pagenocand('0roman', roman_value, position,
                                 self.page.index)
            # XXX also trying ``previous_text + text`` here (with an
            # on-same-line check) was attempted, but it doesn't work
            # currently, as it doesn't get reflected in output!
            if text.isdigit():
                arabic_value = int(text)
                # XXX replace this bound with the book page count if avail.
                if arabic_value < 999:
                    yield pagenocand('1arabic', arabic_value, position,
                                     self.page.index)
        previous_text = text
def get_page_cands(page):
    """Yield page-number candidates for the words in ``self.wordtuples``.

    A word is skipped when the left edge of its box (``box.l``) is below
    ``xcutoff`` or when the word before it is in ``labels``.  Every other
    word yields, in this order:

    * a ``'0roman'`` candidate when ``rnum_to_int`` returns non-zero;
    * a ``'1arabic'`` candidate when the text is all digits and its value
      is below 999.

    Each candidate is built as ``pagenocand(kind, value, word_position,
    self.page.index)``.

    NOTE(review): ``self``, ``xcutoff``, ``labels``, ``pagenocand`` and
    ``rnum_to_int`` are not defined here -- presumably this function is
    nested inside a method and closes over them; confirm.  The ``page``
    argument is not used.
    """
    # XXX mini stack to handle last-3 words ?
    previous_text = ''
    for position, entry in enumerate(self.wordtuples):
        box = entry.box
        text = entry.text
        # The label test is only evaluated for words at/after the cutoff,
        # exactly as the short-circuiting ``or`` did before.
        if not (box.l < xcutoff or previous_text in labels):
            roman_value = rnum_to_int(text)
            if roman_value != 0:
                yield pagenocand('0roman', roman_value, position,
                                 self.page.index)
            # XXX also trying ``previous_text + text`` here (with an
            # on-same-line check) was attempted, but it doesn't work
            # currently, as it doesn't get reflected in output!
            if text.isdigit():
                arabic_value = int(text)
                # XXX replace this bound with the book page count if avail.
                if arabic_value < 999:
                    yield pagenocand('1arabic', arabic_value, position,
                                     self.page.index)
        previous_text = text
def set_base_score(self):
    """Add the initial bonuses to ``self.score``.

    * +5 when ``self.page.info['type']`` is ``'contents'``.
    * +5 when enough "label followed by a number" pairs occur in
      ``self.words``.  A pair is a word found in ``labels`` whose next
      word is either converted to non-zero by ``rnum_to_int`` or is all
      digits.

    ``labels`` and ``rnum_to_int`` are defined outside this method.
    """
    if self.page.info['type'] == 'contents':
        self.score += 5

    # The tally starts at one, so the "> 2" test below needs at least two
    # label/number pairs on the page.
    pair_tally = 1
    for pos, word in enumerate(self.words):
        if word not in labels:
            continue
        following = pos + 1
        if following >= len(self.words):
            # A label in last position has nothing after it to inspect.
            continue
        candidate = self.words[following]
        if rnum_to_int(candidate) != 0 or candidate.isdigit():
            pair_tally += 1

    if pair_tally > 2:
        self.score += 5
def guess_label(words):
    """Split a leading label (e.g. a label word plus its number) off *words*.

    Nothing is split off unless *words* has more than one entry.  The first
    word is cleaned (whitespace/punctuation removed, lower-cased) and then:

    * if it is in ``labels`` it is moved to the label; if more than one
      word then remains and the cleaned next word is all digits, is
      converted to non-zero by ``rnum_to_int``, or is in ``numbers``, that
      word is moved to the label as well;
    * otherwise, if ``rnum_to_int`` converts it to non-zero, it alone is
      moved to the label.

    Trailing ``.``, ``:``, ``-`` and ``,`` characters are removed from the
    last label word.

    *words* is modified in place: label words are popped from its front.

    Returns a tuple ``(labelwords, words)`` where ``words`` is the same
    list object that was passed in.

    ``labels``, ``numbers`` and ``rnum_to_int`` are defined outside this
    function.
    """
    def cleanword(text):
        # The pattern already removes all whitespace, so no separate
        # strip() is needed before lower-casing.
        return re.sub(r'[\s.:,\(\)\/;!\'\"\-]', '', text).lower()

    labelwords = []
    if len(words) > 1:
        w = cleanword(words[0])
        if w in labels:
            labelwords.append(words.pop(0))
            # Safe: there were at least two words before the pop.
            w = cleanword(words[0])
            if len(words) > 1 and (w.isdigit()
                                   or rnum_to_int(w) != 0
                                   or w in numbers):
                labelwords.append(words.pop(0))
        elif rnum_to_int(w) != 0:
            labelwords.append(words.pop(0))

    if labelwords:
        # strip trailing ':', etc, as they're now semantically redundant
        labelwords[-1] = re.sub(r'[.:\-,]*$', '', labelwords[-1])
    return labelwords, words
def check_toc(toc_result):
    """Run sanity checks over a candidate table of contents.

    ``toc_result['qdtoc']`` is a sequence of entries, each a mapping with
    ``'title'``, ``'label'``, ``'pagenum'`` and ``'tocpage'`` keys.  Each
    problem found is reported through ``failit(toc_result, message)``:

    * fewer than four entries (checked first; returns immediately);
    * an all-digit word at word index 5 or later in a title (suspected
      page number);
    * a word from ``labels`` appearing among the lower-cased title words;
    * a title longer than 80 characters;
    * title and label both blank;
    * an arabic page number lower than the previous arabic page number;
    * a ``tocpage`` more than one past the previous entry's ``tocpage``.

    Checking does not stop at the first problem; every entry is examined.

    Returns ``toc_result['isok']``.

    NOTE(review): ``failit``, ``labels`` and ``rnum_to_int`` are defined
    elsewhere; presumably ``failit`` records the message and clears
    ``toc_result['isok']`` -- confirm.
    """
    toc = toc_result['qdtoc']
    if len(toc) < 4:
        failit(toc_result, 'failed due to too short')
        return toc_result['isok']

    prevno = 0
    prevtocpage = toc[0]['tocpage']
    for ti in toc:
        title = ti['title']
        titlewords = [word.lower() for word in title.split()]

        for i, word in enumerate(titlewords):
            if i > 4 and word.isdigit():
                failit(toc_result,
                       'suspected pagenum %s in title %s' % (word, title))

        for label in labels:
            if label in titlewords:
                failit(toc_result,
                       'failed due to label %s seen in "%s' % (label, title))

        if len(title) > 80:
            failit(toc_result, 'failed due to too long title')

        if len(title.strip()) == 0 and len(ti['label'].strip()) == 0:
            failit(toc_result, 'failed due to empty title + label')

        # rnum_to_int gives 0 for non-roman text, so only those page
        # numbers take part in the monotonicity check.  Compare by value
        # (==): ``is 0`` tests object identity, which is not guaranteed
        # for ints.
        rval = rnum_to_int(ti['pagenum'])
        if rval == 0:
            pageval = int(ti['pagenum'])
            if pageval < prevno:
                failit(toc_result, 'non-monotonic pages in toc')
            prevno = pageval

        if ti['tocpage'] > prevtocpage + 1:
            failit(toc_result, 'skipped pages in toc')
        prevtocpage = ti['tocpage']

    return toc_result['isok']
def check_toc(toc_result):
    """Run sanity checks over a candidate table of contents.

    ``toc_result['qdtoc']`` is a sequence of entries, each a mapping with
    ``'title'``, ``'label'``, ``'pagenum'`` and ``'tocpage'`` keys.  Each
    problem found is reported through ``failit(toc_result, message)``:

    * fewer than four entries (checked first; returns immediately);
    * an all-digit word at word index 5 or later in a title (suspected
      page number);
    * a word from ``labels`` appearing among the lower-cased title words;
    * a title longer than 80 characters;
    * title and label both blank;
    * an arabic page number lower than the previous arabic page number;
    * a ``tocpage`` more than one past the previous entry's ``tocpage``.

    Checking does not stop at the first problem; every entry is examined.

    Returns ``toc_result['isok']``.

    NOTE(review): ``failit``, ``labels`` and ``rnum_to_int`` are defined
    elsewhere; presumably ``failit`` records the message and clears
    ``toc_result['isok']`` -- confirm.
    """
    toc = toc_result['qdtoc']
    if len(toc) < 4:
        failit(toc_result, 'failed due to too short')
        return toc_result['isok']

    prevno = 0
    prevtocpage = toc[0]['tocpage']
    for ti in toc:
        title = ti['title']
        titlewords = [word.lower() for word in title.split()]

        for i, word in enumerate(titlewords):
            if i > 4 and word.isdigit():
                failit(toc_result,
                       'suspected pagenum %s in title %s' % (word, title))

        for label in labels:
            if label in titlewords:
                failit(toc_result,
                       'failed due to label %s seen in "%s' % (label, title))

        if len(title) > 80:
            failit(toc_result, 'failed due to too long title')

        if len(title.strip()) == 0 and len(ti['label'].strip()) == 0:
            failit(toc_result, 'failed due to empty title + label')

        # rnum_to_int gives 0 for non-roman text, so only those page
        # numbers take part in the monotonicity check.  Compare by value
        # (==): ``is 0`` tests object identity, which is not guaranteed
        # for ints.
        rval = rnum_to_int(ti['pagenum'])
        if rval == 0:
            pageval = int(ti['pagenum'])
            if pageval < prevno:
                failit(toc_result, 'non-monotonic pages in toc')
            prevno = pageval

        if ti['tocpage'] > prevtocpage + 1:
            failit(toc_result, 'skipped pages in toc')
        prevtocpage = ti['tocpage']

    return toc_result['isok']
def add_match(self, page, match):
    """Record *match* found on *page*, merging it with overlapping matches.

    Steps, in order:

    1. Wrap the match in a ``RangeMatch`` and read the page number string
       from ``page.info['number']``; it is converted with ``rnum_to_int``,
       falling back to ``int`` when that gives 0 and the string is
       non-empty.
    2. If the page has a number, inspect the two candidates returned by
       ``self.find_nearnos(match)``, index 1 first, then index 0.  A
       candidate whose ``val`` equals the page number adds 1 to
       ``self.score``, appends a note to ``info.notes`` and stores its
       ``word_index`` in ``info.nearno``.  Otherwise a candidate whose
       ``val`` is greater than ``pagenoval - 10``, with ``match.a < 10``,
       adds 0.01 to ``self.score``.  The first candidate that triggers
       either rule ends the search.
    3. Store the interval ``[match.b, match.b + match.size)``:
       with no overlap it is added to ``self.matches`` and keyed in
       ``self.matchinfo``; otherwise ``info.match`` is widened to cover
       every overlapping match, a non-default ``nearno`` (``!= -1``) is
       inherited from the overlapped info, and ``info`` is stored under
       the single interval of ``self.matches`` that now overlaps it.

    Raises ``ValueError`` (from the tuple unpacking) if, after merging,
    the number of intervals overlapping the new one is not exactly one.
    """
    info = RangeMatch(self, page, match)

    pageno = page.info['number']
    pagenoval = rnum_to_int(pageno)
    if pagenoval == 0 and len(pageno) > 0:
        pagenoval = int(pageno)

    matchint = Interval.between(match.b, match.b + match.size)
    overlaps = [m for m in self.matches if m & matchint]

    # if nearnos matches either, mark flag and amp score
    if pageno:
        nearnos = self.find_nearnos(match)
        if nearnos is None:  # XXX SHOULDN'T BE NEEDED
            # The loop below indexes [1] and [0]; an empty list here would
            # raise IndexError, so substitute a pair of "no candidate"s.
            nearnos = (None, None)
        for no in nearnos[1], nearnos[0]:
            if no is None:
                continue
            if no.val == pagenoval:
                info.notes += 'nearno: %s' % pageno
                self.score += 1
                info.nearno = no.word_index
                break
            if no.val > pagenoval - 10 and match.a < 10:
                self.score += .01
                break

    if not overlaps:
        # cases: no overlap
        self.matchinfo[matchint] = info
        self.matches = self.matches + IntervalSet([matchint])
    else:
        start = match.b
        end = match.b + match.size
        for i in overlaps:
            oinfo = self.matchinfo[i]
            ostart = oinfo.match.b
            oend = oinfo.match.b + oinfo.match.size
            # When the overlapped match starts earlier, pull the start of
            # the merged match back and shift ``a`` by the same amount.
            scootback = 0
            if ostart < start:
                scootback = start - ostart
                start = ostart
            if oend > end:
                end = oend
            info.match = Match(info.match.a - scootback, start, end - start)
            if oinfo.nearno != -1:
                info.nearno = oinfo.nearno
            # NOTE: only the extent and nearno are merged; score, pageno,
            # notes and pagenos of the overlapped info are not carried
            # over.
        self.matches += IntervalSet([matchint])
        # After the union exactly one interval should cover matchint.
        (new_i,) = [m for m in self.matches if m & matchint]
        self.matchinfo[new_i] = info
def add_match(self, page, match):
    """Record *match* found on *page*, merging it with overlapping matches.

    Steps, in order:

    1. Wrap the match in a ``RangeMatch`` and read the page number string
       from ``page.info['number']``; it is converted with ``rnum_to_int``,
       falling back to ``int`` when that gives 0 and the string is
       non-empty.
    2. If the page has a number, inspect the two candidates returned by
       ``self.find_nearnos(match)``, index 1 first, then index 0.  A
       candidate whose ``val`` equals the page number adds 1 to
       ``self.score``, appends a note to ``info.notes`` and stores its
       ``word_index`` in ``info.nearno``.  Otherwise a candidate whose
       ``val`` is greater than ``pagenoval - 10``, with ``match.a < 10``,
       adds 0.01 to ``self.score``.  The first candidate that triggers
       either rule ends the search.
    3. Store the interval ``[match.b, match.b + match.size)``:
       with no overlap it is added to ``self.matches`` and keyed in
       ``self.matchinfo``; otherwise ``info.match`` is widened to cover
       every overlapping match, a non-default ``nearno`` (``!= -1``) is
       inherited from the overlapped info, and ``info`` is stored under
       the single interval of ``self.matches`` that now overlaps it.

    Raises ``ValueError`` (from the tuple unpacking) if, after merging,
    the number of intervals overlapping the new one is not exactly one.
    """
    info = RangeMatch(self, page, match)

    pageno = page.info['number']
    pagenoval = rnum_to_int(pageno)
    if pagenoval == 0 and len(pageno) > 0:
        pagenoval = int(pageno)

    matchint = Interval.between(match.b, match.b + match.size)
    overlaps = [m for m in self.matches if m & matchint]

    # if nearnos matches either, mark flag and amp score
    if pageno:
        nearnos = self.find_nearnos(match)
        if nearnos is None:  # XXX SHOULDN'T BE NEEDED
            # The loop below indexes [1] and [0]; an empty list here would
            # raise IndexError, so substitute a pair of "no candidate"s.
            nearnos = (None, None)
        for no in nearnos[1], nearnos[0]:
            if no is None:
                continue
            if no.val == pagenoval:
                info.notes += 'nearno: %s' % pageno
                self.score += 1
                info.nearno = no.word_index
                break
            if no.val > pagenoval - 10 and match.a < 10:
                self.score += .01
                break

    if not overlaps:
        # cases: no overlap
        self.matchinfo[matchint] = info
        self.matches = self.matches + IntervalSet([matchint])
    else:
        start = match.b
        end = match.b + match.size
        for i in overlaps:
            oinfo = self.matchinfo[i]
            ostart = oinfo.match.b
            oend = oinfo.match.b + oinfo.match.size
            # When the overlapped match starts earlier, pull the start of
            # the merged match back and shift ``a`` by the same amount.
            scootback = 0
            if ostart < start:
                scootback = start - ostart
                start = ostart
            if oend > end:
                end = oend
            info.match = Match(info.match.a - scootback, start, end - start)
            if oinfo.nearno != -1:
                info.nearno = oinfo.nearno
            # NOTE: only the extent and nearno are merged; score, pageno,
            # notes and pagenos of the overlapped info are not carried
            # over.
        self.matches += IntervalSet([matchint])
        # After the union exactly one interval should cover matchint.
        (new_i,) = [m for m in self.matches if m & matchint]
        self.matchinfo[new_i] = info
def pageno_candidates(pageinfo, page, index):
    """Yield ``Pageno`` candidates found in the words of one page.

    For every word from ``pageinfo.get_words()`` the word text is scanned
    first with ``re_roman`` and then with ``re_arabic``.  Matches are
    keyed by their matched string in a dict shared by both scans:

    * first sighting -- the string is converted (``rnum_to_int`` for
      roman, ``int`` for arabic).  If the value is greater than *index*
      the match is ignored; otherwise a
      ``Pageno(kind, string, value, index - value, [(word, None)])`` is
      created and yielded;
    * later sightings -- ``(word, None)`` is appended to the existing
      candidate's ``coords`` and the same object is yielded again.

    The second element of each coords pair is always ``None``: locating
    the box of a match is not implemented.

    ``re_roman``, ``re_arabic``, ``rnum_to_int`` and ``Pageno`` are
    defined outside this function.
    """
    seen = {}

    # find margin % of top/bottom of text bounding box
    # NOTE: the filter that used these margins (skip text that is not near
    # the page top/bottom) is currently disabled, so the values are not
    # consulted below.  They are still computed so that a page lacking
    # bounds or a height fails here exactly as it did before.
    pagebounds = pageinfo.info['bounds']
    page_height = int(page.get('height'))
    margin = .05
    top_margin = pagebounds.t + page_height * margin
    bottom_margin = pagebounds.b - page_height * margin

    # (kind, pattern, string -> int) for each scan, in scan order.
    scans = (
        ('roman', re_roman, rnum_to_int),
        ('arabic', re_arabic, int),
    )

    for word in pageinfo.get_words():
        # XXX RESTORE: OCR fix-ups are disabled, so the raw word text is
        # scanned.  Formerly, for roman numerals: u->ii, n->ii, l->i,
        # \'->v; for arabic numerals: i->1, o->0, s->5, "->11; plus
        # collapsing the space between adjacent numerals.
        adjusted_text = word.text

        for kind, pattern, to_int in scans:
            for m in pattern.finditer(adjusted_text):
                num_str = m.group()
                if num_str not in seen:
                    value = to_int(num_str)
                    if value > index and value != 0:
                        continue
                    seen[num_str] = Pageno(kind, num_str, value,
                                           index - value, [(word, None)])
                else:
                    seen[num_str].coords.append((word, None))
                yield seen[num_str]
def pageno_candidates(pageinfo, page, index):
    """Yield ``Pageno`` candidates found in the words of one page.

    For every word from ``pageinfo.get_words()`` the word text is scanned
    first with ``re_roman`` and then with ``re_arabic``.  Matches are
    keyed by their matched string in a dict shared by both scans:

    * first sighting -- the string is converted (``rnum_to_int`` for
      roman, ``int`` for arabic).  If the value is greater than *index*
      the match is ignored; otherwise a
      ``Pageno(kind, string, value, index - value, [(word, None)])`` is
      created and yielded;
    * later sightings -- ``(word, None)`` is appended to the existing
      candidate's ``coords`` and the same object is yielded again.

    The second element of each coords pair is always ``None``: locating
    the box of a match is not implemented.

    ``re_roman``, ``re_arabic``, ``rnum_to_int`` and ``Pageno`` are
    defined outside this function.
    """
    seen = {}

    # find margin % of top/bottom of text bounding box
    # NOTE: the filter that used these margins (skip text that is not near
    # the page top/bottom) is currently disabled, so the values are not
    # consulted below.  They are still computed so that a page lacking
    # bounds or a height fails here exactly as it did before.
    pagebounds = pageinfo.info['bounds']
    page_height = int(page.get('height'))
    margin = .05
    top_margin = pagebounds.t + page_height * margin
    bottom_margin = pagebounds.b - page_height * margin

    # (kind, pattern, string -> int) for each scan, in scan order.
    scans = (
        ('roman', re_roman, rnum_to_int),
        ('arabic', re_arabic, int),
    )

    for word in pageinfo.get_words():
        # XXX RESTORE: OCR fix-ups are disabled, so the raw word text is
        # scanned.  Formerly, for roman numerals: u->ii, n->ii, l->i,
        # \'->v; for arabic numerals: i->1, o->0, s->5, "->11; plus
        # collapsing the space between adjacent numerals.
        adjusted_text = word.text

        for kind, pattern, to_int in scans:
            for m in pattern.finditer(adjusted_text):
                num_str = m.group()
                if num_str not in seen:
                    value = to_int(num_str)
                    if value > index and value != 0:
                        continue
                    seen[num_str] = Pageno(kind, num_str, value,
                                           index - value, [(word, None)])
                else:
                    seen[num_str].coords.append((word, None))
                yield seen[num_str]