Python tokenExtract 예제들, pdf2epub.common.tokenExtract Python 예제들

예제 #1

0

파일 보기

파일: paragraphGrouping.py 프로젝트: JFTavares/pdf2epub

    def fixStartChars(self, paragraph, num):
        """Re-align a paragraph whose first line opens with large start characters.

        The token at index ``num`` of the first line is treated as the first
        'non large' word; its 'x' value is the left position shared by the
        lines affected by the large start letter.  The first later line whose
        'left' differs from that value supplies the indent, which is then
        written to the first line and to every later line still at that
        position.

        paragraph -- element whose children are the lines of the paragraph
        num       -- index of the first normally sized token on the first line

        Returns True when an indent was found and applied; False otherwise,
        including when the paragraph has no lines or ``num`` is past the
        last token of the first line.
        """
        lines = paragraph.getchildren()
        if not lines:
            # an empty paragraph has nothing to align (lines[0] would raise)
            return False

        tokens = lines[0].getchildren()
        if num >= len(tokens):
            return False
        first_x = common.tokenExtract(tokens[num])['x']

        following = lines[1:]

        # figure out the indent by looking for the first line
        # with a different left position
        indent = None
        for line in following:
            left = common.lineExtract(line)['left']
            if left != first_x:
                indent = left
                break

        if indent is None:
            return False

        # an indent was found: move the first line and every line that
        # still starts at the start-letter position
        indent_text = unicode(indent)
        lines[0].set('left', indent_text)
        for line in following:
            if common.lineExtract(line)['left'] == first_x:
                line.set('left', indent_text)
        return True

예제 #2

0

파일 보기

    def apply(self, page, c):
        """Regroup the TOKEN elements of a page into LINE elements.

        Every TOKEN under every TEXT of ``page`` is visited in iteration
        order.  A token joins the open line while common.looseCompare accepts
        its 'y' against the 'y' of the token that opened the line (third
        argument: c['y_height_diff']); otherwise the open line is closed and
        the token starts a new one.  Each LINE in the result is then passed
        to self.lineSummary.

        Returns the element created by common.copyElementAttributes(page)
        with the LINE elements appended.
        """
        result = common.copyElementAttributes(page)
        opener = None  # extracted info of the token that started the open line
        for text in page.iter('TEXT'):
            for token in text.iter('TOKEN'):
                info = common.tokenExtract(token)

                if opener is None:
                    # very first token: open the first line
                    opener = info
                    current = etree.Element('LINE')

                if not common.looseCompare(info['y'], opener['y'],
                                           c['y_height_diff']):
                    # different line: close the open one and start afresh
                    result.append(current)
                    current = etree.Element('LINE')
                    opener = info
                current.append(token)

        # the line still open after the last token
        if opener is not None:
            result.append(current)

        for finished in result.iter('LINE'):
            self.lineSummary(finished)

        return result

예제 #3

0

파일 보기

파일: lineExtractors.py 프로젝트: KasaiDot/pdf2epub

    def apply(self, page, c):
        """Build a copy of ``page`` whose TOKEN elements are grouped into LINEs.

        Tokens are taken from each TEXT element in turn.  The 'y' of each
        token is checked with common.looseCompare against the 'y' of the
        first token of the line being built, passing c['y_height_diff'];
        a failed check ends that line and begins another with the token.
        After grouping, self.lineSummary is called on every LINE.

        Returns the element from common.copyElementAttributes(page) holding
        the LINE elements.
        """
        grouped = common.copyElementAttributes(page)
        head_info = None  # info of the first token on the line being built
        for text_el in page.iter('TEXT'):
            for tok in text_el.iter('TOKEN'):
                tok_info = common.tokenExtract(tok)

                if head_info is None:
                    # no line yet: this token opens the first one
                    head_info = tok_info
                    pending = etree.Element('LINE')

                same_line = common.looseCompare(tok_info['y'], head_info['y'],
                                                c['y_height_diff'])
                if not same_line:
                    # emit the finished line, then reset for the next one
                    grouped.append(pending)
                    pending = etree.Element('LINE')
                    head_info = tok_info
                pending.append(tok)

        # emit the trailing line, if any token was seen at all
        if head_info is not None:
            grouped.append(pending)

        for line_el in grouped.iter('LINE'):
            self.lineSummary(line_el)

        return grouped

예제 #4

0

파일 보기

파일: paragraphGrouping.py 프로젝트: KasaiDot/pdf2epub

    def fixStartChars(self, paragraph, num):
        """Fix the 'left' of lines displaced by large start characters.

        ``num`` indexes the first 'non large' token on the first line; the
        'x' of that token is the left position shared by the lines affected
        by the large start letter.  The indent is the 'left' of the first
        subsequent line that differs from that position.  When one is found
        it is written to the first line and to each subsequent line whose
        'left' equals the start-letter position.

        paragraph -- element whose children are the lines of the paragraph
        num       -- index of the first normally sized token on the first line

        Returns True if line indents were rewritten, False if not (no lines,
        ``num`` out of range for the first line, or no differing indent).
        """
        lines = paragraph.getchildren()
        if not lines:
            # guard: indexing lines[0] on an empty paragraph would raise
            return False

        first_tokens = lines[0].getchildren()
        if num >= len(first_tokens):
            return False
        first_x = common.tokenExtract(first_tokens[num])['x']

        rest = lines[1:]

        # the first line with a different left position gives the indent
        indent = None
        for candidate in rest:
            candidate_left = common.lineExtract(candidate)['left']
            if candidate_left != first_x:
                indent = candidate_left
                break

        if indent is None:
            return False

        # apply the indent to the first line and to the displaced lines
        new_left = unicode(indent)
        lines[0].set('left', new_left)
        for candidate in rest:
            if common.lineExtract(candidate)['left'] == first_x:
                candidate.set('left', new_left)
        return True

예제 #5

0

파일 보기

파일: paragraphGrouping.py 프로젝트: JFTavares/pdf2epub

 def numberOfStartChars(self, paragraph):
     """Count the leading single-character tokens on a paragraph's first line.

     Returns 0 when the paragraph has one line or none; otherwise the number
     of consecutive TOKENs, from the start of the first line, whose
     extracted 'chars' value equals 1.
     """
     lines = paragraph.getchildren()
     if len(lines) <= 1:
         return 0

     count = 0
     for token in lines[0].iter('TOKEN'):
         if common.tokenExtract(token)['chars'] != 1:
             # first multi-character token ends the run
             break
         count += 1
     return count

예제 #6

0

파일 보기

파일: fontExtractors.py 프로젝트: KasaiDot/pdf2epub

 def apply(self, token, c):
     """Set the token's 'size' attribute from its extracted 'font-size'.

     Sizes below self.smallest become 'xx-small' and sizes above
     self.largest become 'xx-large'.  Anything in between is located in
     self.sizes with bisect_right and mapped through self.table.

     token -- element to label; its 'size' attribute is overwritten
     c     -- not used by this method
     """
     sz = common.tokenExtract(token)['font-size']
     if sz < self.smallest:
         s = 'xx-small'
     elif sz > self.largest:
         s = 'xx-large'
     else:
         # bisect_right yields len(self.sizes) when sz is >= the last entry,
         # which made self.sizes[v] raise IndexError; clamp to the last slot.
         # NOTE(review): assumes self.sizes is sorted ascending -- confirm.
         v = min(bisect.bisect_right(self.sizes, sz), len(self.sizes) - 1)
         s = self.table[self.sizes[v]]
     token.set('size', unicode(s))

예제 #7

0

파일 보기

파일: paragraphGrouping.py 프로젝트: KasaiDot/pdf2epub

 def numberOfStartChars(self, paragraph):
     """Return how many single-character tokens open the first line.

     Only paragraphs with more than one line are inspected; for those, the
     TOKENs of the first line are walked in order and counting stops at the
     first token whose extracted 'chars' value is not 1.
     """
     lines = paragraph.getchildren()
     found = 0
     if len(lines) < 2:
         return found

     for tok in lines[0].iter('TOKEN'):
         details = common.tokenExtract(tok)
         if details['chars'] != 1:
             break
         # single character
         found += 1
     return found

예제 #8

0

파일 보기

 def apply(self, token, c):
     """Label a token with a size name derived from its 'font-size'.

     Below self.smallest the label is 'xx-small', above self.largest it is
     'xx-large'; otherwise the label is self.table[self.sizes[v]] where v
     comes from bisect_right over self.sizes.  The label is stored in the
     token's 'size' attribute.

     token -- element to label
     c     -- not used by this method
     """
     sz = common.tokenExtract(token)['font-size']
     if sz < self.smallest:
         s = 'xx-small'
     elif sz > self.largest:
         s = 'xx-large'
     else:
         v = bisect.bisect_right(self.sizes, sz)
         # bisect_right returns len(self.sizes) for sz at or above the last
         # entry; without this clamp the lookup below raised IndexError.
         # NOTE(review): presumes self.sizes is sorted ascending -- confirm.
         last = len(self.sizes) - 1
         if v > last:
             v = last
         s = self.table[self.sizes[v]]
     token.set('size', unicode(s))

예제 #9

0

파일 보기

    def lineSummary(self, line):
        """Write summary attributes onto a LINE from its TOKEN descendants.

        Attributes are set in this order:
        'base'   -- common.largest of the token bases when there are at most
                    two tokens, otherwise common.mostCommon
        'top'    -- common.smallest for at most two tokens, otherwise
                    common.mostCommon
        'left'   -- lowest token 'left' (None when there are no tokens)
        'right'  -- highest token 'right' (None when there are no tokens)
        'height' -- common.largest for at most two tokens, otherwise
                    common.mostCommon
        'chars'  -- sum of the token 'chars' values
        """
        bases = []
        tops = []
        heights = []
        leftmost = None
        rightmost = None
        total_chars = 0

        for token in line.iter('TOKEN'):
            info = common.tokenExtract(token)
            bases.append(info['base'])
            tops.append(info['top'])
            heights.append(info['height'])

            if leftmost is None or info['left'] < leftmost:
                leftmost = info['left']
            if rightmost is None or info['right'] > rightmost:
                rightmost = info['right']

            total_chars += info['chars']

        # the three lists grow together, so one length check covers them all
        few_tokens = len(bases) <= 2

        pick_base = common.largest if few_tokens else common.mostCommon
        line.set('base', unicode(pick_base(bases)))

        pick_top = common.smallest if few_tokens else common.mostCommon
        line.set('top', unicode(pick_top(tops)))

        line.set('left', unicode(leftmost))
        line.set('right', unicode(rightmost))

        pick_height = common.largest if few_tokens else common.mostCommon
        line.set('height', unicode(pick_height(heights)))
        line.set('chars', unicode(total_chars))

예제 #10

0

파일 보기

파일: lineExtractors.py 프로젝트: KasaiDot/pdf2epub

    def lineSummary(self, line):
        """Summarise the TOKENs of a LINE into attributes on the LINE itself.

        Collects 'base', 'top' and 'height' of every token, tracks the
        lowest 'left' and highest 'right', and totals 'chars'.  With at most
        two tokens, 'base' and 'height' use common.largest and 'top' uses
        common.smallest; with more, all three use common.mostCommon.
        'left' and 'right' stay None for a line without tokens.  The
        attributes are written as 'base', 'top', 'left', 'right', 'height',
        'chars', in that order.
        """
        base_values, top_values, height_values = [], [], []
        min_left = None
        max_right = None
        char_total = 0

        for tok in line.iter('TOKEN'):
            tok_info = common.tokenExtract(tok)
            base_values.append(tok_info['base'])
            top_values.append(tok_info['top'])
            height_values.append(tok_info['height'])

            if min_left is None or tok_info['left'] < min_left:
                min_left = tok_info['left']

            if max_right is None or tok_info['right'] > max_right:
                max_right = tok_info['right']

            char_total += tok_info['chars']

        # one entry per token in each list, so a single count decides all
        short_line = len(base_values) <= 2

        # apply summary
        if short_line:
            base_summary = common.largest(base_values)
        else:
            base_summary = common.mostCommon(base_values)
        line.set('base', unicode(base_summary))

        if short_line:
            top_summary = common.smallest(top_values)
        else:
            top_summary = common.mostCommon(top_values)
        line.set('top', unicode(top_summary))

        line.set('left', unicode(min_left))
        line.set('right', unicode(max_right))

        if short_line:
            height_summary = common.largest(height_values)
        else:
            height_summary = common.mostCommon(height_values)
        line.set('height', unicode(height_summary))
        line.set('chars', unicode(char_total))