예제 #1
0
        def get_page_cands(page):
            """Yield page-number candidates for the words in ``self.wordtuples``.

            Each word is tried first as a roman numeral (via ``rnum_to_int``)
            and then as an arabic number; a ``pagenocand`` tagged ``'0roman'``
            or ``'1arabic'`` is yielded for every successful parse, together
            with the word's position and ``self.page.index``.

            A word is skipped when its ``box.l`` is below ``xcutoff`` or when
            the word immediately before it is in ``labels``.

            NOTE: the ``page`` argument is not read here; the page index is
            taken from ``self.page``.
            """
            # XXX mini stack to handle last-3 words ?
            lastword = ''

            for i, wordtuple in enumerate(self.wordtuples):
                box = wordtuple.box
                word = wordtuple.text
                if box.l < xcutoff or lastword in labels:
                    lastword = word
                    continue
                r = rnum_to_int(word)
                if r != 0:
                    yield pagenocand('0roman', r, i, self.page.index)
                # XXX should also try lastword + word (with an on-same-line
                # check); this doesn't work currently, as it doesn't get
                # reflected in output!
                if word.isdigit():
                    try:
                        val = int(word)
                    except ValueError:
                        # isdigit() is also true for characters such as
                        # superscript digits, which int() cannot parse.
                        val = None
                    # XXX replace the 999 check with book page count if avail.
                    if val is not None and val < 999:
                        yield pagenocand('1arabic', val, i, self.page.index)
                lastword = word
예제 #2
0
        def get_page_cands(page):
            """Yield page-number candidates for the words in ``self.wordtuples``.

            Each word is tried first as a roman numeral (via ``rnum_to_int``)
            and then as an arabic number; a ``pagenocand`` tagged ``'0roman'``
            or ``'1arabic'`` is yielded for every successful parse, together
            with the word's position and ``self.page.index``.

            A word is skipped when its ``box.l`` is below ``xcutoff`` or when
            the word immediately before it is in ``labels``.

            NOTE: the ``page`` argument is not read here; the page index is
            taken from ``self.page``.
            """
            # XXX mini stack to handle last-3 words ?
            lastword = ''

            for i, wordtuple in enumerate(self.wordtuples):
                box = wordtuple.box
                word = wordtuple.text
                if box.l < xcutoff or lastword in labels:
                    lastword = word
                    continue
                r = rnum_to_int(word)
                if r != 0:
                    yield pagenocand('0roman', r, i, self.page.index)
                # XXX should also try lastword + word (with an on-same-line
                # check); this doesn't work currently, as it doesn't get
                # reflected in output!
                if word.isdigit():
                    try:
                        val = int(word)
                    except ValueError:
                        # isdigit() is also true for characters such as
                        # superscript digits, which int() cannot parse.
                        val = None
                    # XXX replace the 999 check with book page count if avail.
                    if val is not None and val < 999:
                        yield pagenocand('1arabic', val, i, self.page.index)
                lastword = word
예제 #3
0
 def set_base_score(self):
     """Seed ``self.score`` from the page type and from label/number pairs.

     Adds 5 when ``self.page.info['type']`` is ``'contents'``. Then counts
     the words that are in ``labels`` and are directly followed by a word
     that ``rnum_to_int`` parses to a non-zero value or that consists only
     of digits. The count starts at 1, and another 5 is added once it
     exceeds 2.
     """
     if self.page.info['type'] == 'contents':
         self.score += 5

     words = self.words
     hits = 1
     # Pair every word with its successor; the last word has none.
     for current, following in zip(words, words[1:]):
         if current not in labels:
             continue
         if rnum_to_int(following) != 0 or following.isdigit():
             hits += 1

     if hits > 2:
         self.score += 5
예제 #4
0
 def set_base_score(self):
     """Seed ``self.score`` from the page type and from label/number pairs.

     Adds 5 when ``self.page.info['type']`` is ``'contents'``. Then counts
     the words that are in ``labels`` and are directly followed by a word
     that ``rnum_to_int`` parses to a non-zero value or that consists only
     of digits. The count starts at 1, and another 5 is added once it
     exceeds 2.
     """
     if self.page.info['type'] == 'contents':
         self.score += 5

     words = self.words
     hits = 1
     # Pair every word with its successor; the last word has none.
     for current, following in zip(words, words[1:]):
         if current not in labels:
             continue
         if rnum_to_int(following) != 0 or following.isdigit():
             hits += 1

     if hits > 2:
         self.score += 5
예제 #5
0
def guess_label(words):
    """Split a leading label off the front of ``words``.

    With at least two words present, the first word is cleaned (whitespace
    and punctuation removed, lowercased) and examined:

    - if it is in ``labels`` it is taken as a label word; the next word is
      taken too when it is all digits, parses with ``rnum_to_int``, or is in
      ``numbers`` -- but only if at least one more word remains after it;
    - otherwise it is taken on its own when ``rnum_to_int`` parses it to a
      non-zero value.

    Trailing ``.``, ``:``, ``-`` and ``,`` characters are stripped from the
    last label word.

    Args:
        words: list of word strings. It is modified in place: words taken
            as label are popped from its front.

    Returns:
        A ``(labelwords, words)`` tuple, where ``words`` is the same list
        object that was passed in, minus the label words.
    """
    def cleanword(text):
        # The character class already removes all whitespace, so no extra
        # strip() is needed before lowercasing.
        return re.sub(r'[\s.:,\(\)\/;!\'\"\-]', '', text).lower()

    labelwords = []
    if len(words) > 1:
        w = cleanword(words[0])
        if w in labels:
            labelwords.append(words.pop(0))
            w = cleanword(words[0])
            if len(words) > 1 and (w.isdigit()
                                   or rnum_to_int(w) != 0
                                   or w in numbers):
                labelwords.append(words.pop(0))
        elif rnum_to_int(w) != 0:
            labelwords.append(words.pop(0))
    if labelwords:
        # strip trailing ':', etc, as they're now semantically redundant
        labelwords[-1] = re.sub(r'[.:\-,]*$', '', labelwords[-1])
    return labelwords, words
예제 #6
0
def guess_label(words):
    """Split a leading label off the front of ``words``.

    With at least two words present, the first word is cleaned (whitespace
    and punctuation removed, lowercased) and examined:

    - if it is in ``labels`` it is taken as a label word; the next word is
      taken too when it is all digits, parses with ``rnum_to_int``, or is in
      ``numbers`` -- but only if at least one more word remains after it;
    - otherwise it is taken on its own when ``rnum_to_int`` parses it to a
      non-zero value.

    Trailing ``.``, ``:``, ``-`` and ``,`` characters are stripped from the
    last label word.

    Args:
        words: list of word strings. It is modified in place: words taken
            as label are popped from its front.

    Returns:
        A ``(labelwords, words)`` tuple, where ``words`` is the same list
        object that was passed in, minus the label words.
    """
    def cleanword(text):
        # The character class already removes all whitespace, so no extra
        # strip() is needed before lowercasing.
        return re.sub(r'[\s.:,\(\)\/;!\'\"\-]', '', text).lower()

    labelwords = []
    if len(words) > 1:
        w = cleanword(words[0])
        if w in labels:
            labelwords.append(words.pop(0))
            w = cleanword(words[0])
            if len(words) > 1 and (w.isdigit() or rnum_to_int(w) != 0
                                   or w in numbers):
                labelwords.append(words.pop(0))
        elif rnum_to_int(w) != 0:
            labelwords.append(words.pop(0))
    if labelwords:
        # strip trailing ':', etc, as they're now semantically redundant
        labelwords[-1] = re.sub(r'[.:\-,]*$', '', labelwords[-1])
    return labelwords, words
예제 #7
0
def check_toc(toc_result):
    """Sanity-check a candidate table of contents.

    Every problem found is reported through ``failit(toc_result, reason)``.
    Except for the too-short case, checking continues after a failure so
    that every problem is reported. Problems checked for:

    - fewer than 4 entries (checked first; nothing else is examined then),
    - a title word at position 6 or later that is all digits,
    - a title containing one of the words in ``labels``,
    - a title longer than 80 characters,
    - title and label both blank,
    - page numbers going backwards (only page numbers that ``rnum_to_int``
      maps to 0 are compared, as integers),
    - an entry's ``tocpage`` more than one past the previous entry's.

    Args:
        toc_result: mapping with a ``'qdtoc'`` list of entries (each with
            ``'title'``, ``'label'``, ``'pagenum'`` and ``'tocpage'``) and
            an ``'isok'`` value -- presumably updated by ``failit``; confirm.

    Returns:
        ``toc_result['isok']`` as it stands after all checks.
    """
    toc = toc_result['qdtoc']

    if len(toc) < 4:
        failit(toc_result, 'failed due to too short')
        return toc_result['isok']
    prevno = 0
    prevtocpage = toc[0]['tocpage']
    for ti in toc:
        title = ti['title']
        titlewords = [word.lower() for word in title.split()]
        # Only words from the sixth onwards are checked for stray numbers.
        for word in titlewords[5:]:
            if word.isdigit():
                failit(toc_result, 'suspected pagenum %s in title %s' %
                       (word, title))
        for label in labels:
            if label in titlewords:
                failit(toc_result, 'failed due to label %s seen in "%s' %
                       (label, title))
        if len(title) > 80:
            failit(toc_result, 'failed due to too long title')
        if len(title.strip()) == 0 and len(ti['label'].strip()) == 0:
            failit(toc_result, 'failed due to empty title + label')
        # Compare by value: `is 0` tests object identity, which only
        # happens to work for small ints on some interpreters.
        if rnum_to_int(ti['pagenum']) == 0:
            pagenum = int(ti['pagenum'])
            if pagenum < prevno:
                failit(toc_result, 'non-monotonic pages in toc')
            prevno = pagenum
        if ti['tocpage'] > prevtocpage + 1:
            failit(toc_result, 'skipped pages in toc')
        prevtocpage = ti['tocpage']
    return toc_result['isok']
예제 #8
0
def check_toc(toc_result):
    """Sanity-check a candidate table of contents.

    Every problem found is reported through ``failit(toc_result, reason)``.
    Except for the too-short case, checking continues after a failure so
    that every problem is reported. Problems checked for:

    - fewer than 4 entries (checked first; nothing else is examined then),
    - a title word at position 6 or later that is all digits,
    - a title containing one of the words in ``labels``,
    - a title longer than 80 characters,
    - title and label both blank,
    - page numbers going backwards (only page numbers that ``rnum_to_int``
      maps to 0 are compared, as integers),
    - an entry's ``tocpage`` more than one past the previous entry's.

    Args:
        toc_result: mapping with a ``'qdtoc'`` list of entries (each with
            ``'title'``, ``'label'``, ``'pagenum'`` and ``'tocpage'``) and
            an ``'isok'`` value -- presumably updated by ``failit``; confirm.

    Returns:
        ``toc_result['isok']`` as it stands after all checks.
    """
    toc = toc_result['qdtoc']

    if len(toc) < 4:
        failit(toc_result, 'failed due to too short')
        return toc_result['isok']
    prevno = 0
    prevtocpage = toc[0]['tocpage']
    for ti in toc:
        title = ti['title']
        titlewords = [word.lower() for word in title.split()]
        # Only words from the sixth onwards are checked for stray numbers.
        for word in titlewords[5:]:
            if word.isdigit():
                failit(toc_result, 'suspected pagenum %s in title %s' %
                       (word, title))
        for label in labels:
            if label in titlewords:
                failit(toc_result, 'failed due to label %s seen in "%s'
                       % (label, title))
        if len(title) > 80:
            failit(toc_result, 'failed due to too long title')
        if len(title.strip()) == 0 and len(ti['label'].strip()) == 0:
            failit(toc_result, 'failed due to empty title + label')
        # Compare by value: `is 0` tests object identity, which only
        # happens to work for small ints on some interpreters.
        if rnum_to_int(ti['pagenum']) == 0:
            pagenum = int(ti['pagenum'])
            if pagenum < prevno:
                failit(toc_result, 'non-monotonic pages in toc')
            prevno = pagenum
        if ti['tocpage'] > prevtocpage + 1:
            failit(toc_result, 'skipped pages in toc')
        prevtocpage = ti['tocpage']
    return toc_result['isok']
예제 #9
0
    def add_match(self, page, match):
        """Record ``match`` against ``page``, merging with overlapping matches.

        A ``RangeMatch`` is built for the match and stored in
        ``self.matchinfo`` under the interval
        ``[match.b, match.b + match.size]``, which is also added to
        ``self.matches``.

        Scoring: when the page has a number (``page.info['number']``, parsed
        with ``rnum_to_int`` and falling back to ``int``), the candidates
        returned by ``self.find_nearnos(match)`` are examined, index 1 first
        and then index 0. The first non-None candidate decides:

        - its ``val`` equals the page number: ``self.score`` gains 1, a note
          is appended to the info and its ``nearno`` is set to the
          candidate's ``word_index``;
        - otherwise, its ``val`` is greater than the page number minus 10
          and ``match.a < 10``: ``self.score`` gains 0.01.

        Overlap handling: when existing intervals intersect the new one, the
        stored match is widened to cover them (shifting ``a`` back by the
        same amount the start moved), a ``nearno`` other than -1 is
        inherited from them, and the info is stored under the resulting
        merged interval. Entries of the absorbed intervals are left in
        ``self.matchinfo``.
        """
        info = RangeMatch(self, page, match)
        pageno = page.info['number']
        pagenoval = rnum_to_int(pageno)
        if pagenoval == 0 and len(pageno) > 0:
            pagenoval = int(pageno)

        matchint = Interval.between(match.b, match.b + match.size)

        overlaps = [m for m in self.matches
                    if m & matchint]

        # if nearnos matches either, mark flag and amp score
        if pageno:
            nearnos = self.find_nearnos(match)
            if nearnos is None:  # XXX shouldn't be needed!
                # Must still be indexable as a pair below; an empty list
                # here would raise IndexError on nearnos[1].
                nearnos = (None, None)
            for no in nearnos[1], nearnos[0]:
                if no is None:
                    continue
                if no.val == pagenoval:
                    info.notes += 'nearno: %s' % pageno
                    self.score += 1
                    info.nearno = no.word_index
                    break
                if no.val > pagenoval - 10 and match.a < 10:
                    self.score += .01
                    break

        # cases: no overlap
        if len(overlaps) == 0:
            self.matchinfo[matchint] = info
            self.matches = self.matches + IntervalSet([matchint])
        else:
            start = match.b
            end = match.b + match.size
            for i in overlaps:
                oinfo = self.matchinfo[i]
                ostart = oinfo.match.b
                oend = oinfo.match.b + oinfo.match.size
                scootback = 0
                if ostart < start:
                    scootback = start - ostart
                    start = ostart
                if oend > end:
                    end = oend
                info.match = Match(info.match.a - scootback, start, end - start)
                if oinfo.nearno != -1:
                    info.nearno = oinfo.nearno
            self.matches += IntervalSet([matchint])
            # Exactly one interval is expected to intersect now --
            # presumably the set coalesces overlapping intervals; confirm.
            (new_i,) = [m for m in self.matches if m & matchint]
            self.matchinfo[new_i] = info
예제 #10
0
    def add_match(self, page, match):
        """Record ``match`` against ``page``, merging with overlapping matches.

        A ``RangeMatch`` is built for the match and stored in
        ``self.matchinfo`` under the interval
        ``[match.b, match.b + match.size]``, which is also added to
        ``self.matches``.

        Scoring: when the page has a number (``page.info['number']``, parsed
        with ``rnum_to_int`` and falling back to ``int``), the candidates
        returned by ``self.find_nearnos(match)`` are examined, index 1 first
        and then index 0. The first non-None candidate decides:

        - its ``val`` equals the page number: ``self.score`` gains 1, a note
          is appended to the info and its ``nearno`` is set to the
          candidate's ``word_index``;
        - otherwise, its ``val`` is greater than the page number minus 10
          and ``match.a < 10``: ``self.score`` gains 0.01.

        Overlap handling: when existing intervals intersect the new one, the
        stored match is widened to cover them (shifting ``a`` back by the
        same amount the start moved), a ``nearno`` other than -1 is
        inherited from them, and the info is stored under the resulting
        merged interval. Entries of the absorbed intervals are left in
        ``self.matchinfo``.
        """
        info = RangeMatch(self, page, match)
        pageno = page.info['number']
        pagenoval = rnum_to_int(pageno)
        if pagenoval == 0 and len(pageno) > 0:
            pagenoval = int(pageno)

        matchint = Interval.between(match.b, match.b + match.size)

        overlaps = [m for m in self.matches if m & matchint]

        # if nearnos matches either, mark flag and amp score
        if pageno:
            nearnos = self.find_nearnos(match)
            if nearnos is None:  # XXX shouldn't be needed!
                # Must still be indexable as a pair below; an empty list
                # here would raise IndexError on nearnos[1].
                nearnos = (None, None)
            for no in nearnos[1], nearnos[0]:
                if no is None:
                    continue
                if no.val == pagenoval:
                    info.notes += 'nearno: %s' % pageno
                    self.score += 1
                    info.nearno = no.word_index
                    break
                if no.val > pagenoval - 10 and match.a < 10:
                    self.score += .01
                    break

        # cases: no overlap
        if len(overlaps) == 0:
            self.matchinfo[matchint] = info
            self.matches = self.matches + IntervalSet([matchint])
        else:
            start = match.b
            end = match.b + match.size
            for i in overlaps:
                oinfo = self.matchinfo[i]
                ostart = oinfo.match.b
                oend = oinfo.match.b + oinfo.match.size
                scootback = 0
                if ostart < start:
                    scootback = start - ostart
                    start = ostart
                if oend > end:
                    end = oend
                info.match = Match(info.match.a - scootback, start,
                                   end - start)
                if oinfo.nearno != -1:
                    info.nearno = oinfo.nearno
            self.matches += IntervalSet([matchint])
            # Exactly one interval is expected to intersect now --
            # presumably the set coalesces overlapping intervals; confirm.
            (new_i, ) = [m for m in self.matches if m & matchint]
            self.matchinfo[new_i] = info
예제 #11
0
def pageno_candidates(pageinfo, page, index):
    """Yield page-number candidates found in the words of one page.

    Each word from ``pageinfo.get_words()`` is scanned with ``re_roman`` and
    then with ``re_arabic``. For every matched string a ``Pageno`` is
    created the first time it is seen, as
    ``Pageno(kind, num_str, value, index - value, [(word, None)])``;
    later occurrences append ``(word, None)`` to its ``coords``. The
    ``Pageno`` is yielded on every occurrence, so the same object can be
    yielded more than once.

    A first occurrence whose value is greater than ``index`` is skipped and
    not recorded.

    Args:
        pageinfo: object providing ``info["bounds"]`` (with ``t`` and ``b``)
            and ``get_words()``.
        page: object whose ``get("height")`` is convertible with ``int``.
        index: position of the page, used as the upper bound for candidate
            values.

    Yields:
        ``Pageno`` objects, roman candidates of a word before its arabic
        ones.
    """
    seen = {}

    # find margin % of top/bottom of text bounding box
    # NOTE: the margins are currently unused; the filter that skipped words
    # away from the page top/bottom is disabled.
    pagebounds = pageinfo.info["bounds"]
    page_height = int(page.get("height"))
    margin = 0.05
    top_margin = pagebounds.t + page_height * margin
    bottom_margin = pagebounds.b - page_height * margin

    def find_box(m):
        # Placeholder for locating the box of a regex match within a word;
        # until it exists, coords carry None in its place.
        raise NotImplementedError("NYI")

    for word in pageinfo.get_words():
        fmt_text = word.text

        # look for roman numerals
        # fix some common OCR errors
        # XXX RESTORE adjusted_text = (fmt_text.replace('u', 'ii')
        #                  .replace('n', 'ii')
        #                  .replace('l', 'i')
        #                  .replace(r"\'", 'v'))
        adjusted_text = fmt_text

        # collapse space between potential roman numerals
        # XXX RESTORE adjusted_text = re.sub(r'\b([xvi]+)\b +\b([xvi]+)\b', r'\1\1', adjusted_text)
        for m in re_roman.finditer(adjusted_text):
            num_str = m.group()
            if num_str not in seen:
                i = rnum_to_int(num_str)
                if i > index and i != 0:
                    continue
                seen[num_str] = Pageno(
                    "roman",
                    num_str,
                    i,
                    index - i,
                    [(word, None)],
                )
            else:
                seen[num_str].coords.append((word, None))
            yield seen[num_str]

        # look for arabic numerals
        # fix some common OCR errors
        # XXX RESTORE adjusted_text = fmt_text.replace('i', '1').replace('o', '0').replace('s', '5').replace('"', '11')
        # collapse spaces
        # XXX RESTORE adjusted_text = re.sub(r'\b(\d+)\b +\b(\d+)\b', r'\1\1', adjusted_text)
        for m in re_arabic.finditer(adjusted_text):
            num_str = m.group()
            if num_str not in seen:
                i = int(num_str)
                if i > index and i != 0:
                    continue
                seen[num_str] = Pageno("arabic", num_str, i, index - i, [(word, None)])
            else:
                seen[num_str].coords.append((word, None))
            yield seen[num_str]
예제 #12
0
def pageno_candidates(pageinfo, page, index):
    """Yield page-number candidates found in the words of one page.

    Each word from ``pageinfo.get_words()`` is scanned with ``re_roman`` and
    then with ``re_arabic``. For every matched string a ``Pageno`` is
    created the first time it is seen, as
    ``Pageno(kind, num_str, value, index - value, [(word, None)])``;
    later occurrences append ``(word, None)`` to its ``coords``. The
    ``Pageno`` is yielded on every occurrence, so the same object can be
    yielded more than once.

    A first occurrence whose value is greater than ``index`` is skipped and
    not recorded.

    Args:
        pageinfo: object providing ``info['bounds']`` (with ``t`` and ``b``)
            and ``get_words()``.
        page: object whose ``get('height')`` is convertible with ``int``.
        index: position of the page, used as the upper bound for candidate
            values.

    Yields:
        ``Pageno`` objects, roman candidates of a word before its arabic
        ones.
    """
    seen = {}

    # find margin % of top/bottom of text bounding box
    # NOTE: the margins are currently unused; the filter that skipped words
    # away from the page top/bottom is disabled.
    pagebounds = pageinfo.info['bounds']
    page_height = int(page.get('height'))
    margin = .05
    top_margin = pagebounds.t + page_height * margin
    bottom_margin = pagebounds.b - page_height * margin

    def find_box(m):
        # Placeholder for locating the box of a regex match within a word;
        # until it exists, coords carry None in its place.
        raise NotImplementedError('NYI')

    for word in pageinfo.get_words():
        fmt_text = word.text

        # look for roman numerals
        # fix some common OCR errors
        # XXX RESTORE adjusted_text = (fmt_text.replace('u', 'ii')
        #                  .replace('n', 'ii')
        #                  .replace('l', 'i')
        #                  .replace(r"\'", 'v'))
        adjusted_text = fmt_text

        # collapse space between potential roman numerals
        # XXX RESTORE adjusted_text = re.sub(r'\b([xvi]+)\b +\b([xvi]+)\b', r'\1\1', adjusted_text)
        for m in re_roman.finditer(adjusted_text):
            num_str = m.group()
            if num_str not in seen:
                i = rnum_to_int(num_str)
                if i > index and i != 0:
                    continue
                seen[num_str] = Pageno('roman', num_str, i, index - i,
                                       [(word, None)])
            else:
                seen[num_str].coords.append((word, None))
            yield seen[num_str]

        # look for arabic numerals
        # fix some common OCR errors
        # XXX RESTORE adjusted_text = fmt_text.replace('i', '1').replace('o', '0').replace('s', '5').replace('"', '11')
        # collapse spaces
        # XXX RESTORE adjusted_text = re.sub(r'\b(\d+)\b +\b(\d+)\b', r'\1\1', adjusted_text)
        for m in re_arabic.finditer(adjusted_text):
            num_str = m.group()
            if num_str not in seen:
                i = int(num_str)
                if i > index and i != 0:
                    continue
                seen[num_str] = Pageno('arabic', num_str, i, index - i,
                                       [(word, None)])
            else:
                seen[num_str].coords.append((word, None))
            yield seen[num_str]