示例#1
0
    def apply(self, page, c):
        """Regroup the TOKEN elements of ``page`` into LINE elements.

        Tokens are visited TEXT by TEXT. A token joins the current line when
        its ``'y'`` value compares equal to the ``'y'`` of the token that
        opened that line, as judged by ``common.looseCompare`` with the
        tolerance ``c['y_height_diff']``; otherwise it opens a new line.
        Once grouping is done, ``self.lineSummary`` is called on every LINE
        in the result.

        Args:
            page: element that is searched for TEXT elements and, inside
                them, TOKEN elements.
            c: configuration mapping; only ``c['y_height_diff']`` is read.

        Returns:
            The element produced by ``common.copyElementAttributes(page)``
            with the LINE elements appended. No LINE is appended when the
            page contains no tokens.
        """
        output = common.copyElementAttributes(page)
        line = None
        line_info = None  # info of the token that opened the current line

        for text in page.iter('TEXT'):
            # Snapshot the tokens before touching them: appending a token to
            # a LINE may re-parent it (lxml moves elements on append --
            # assumed backend, confirm), and walking a tree while mutating it
            # is fragile.
            for token in list(text.iter('TOKEN')):
                token_info = common.tokenExtract(token)

                if line_info is None:
                    # very first token: open the first line
                    line_info = token_info
                    line = etree.Element('LINE')

                if common.looseCompare(token_info['y'], line_info['y'],
                                       c['y_height_diff']):
                    # same line
                    line.append(token)
                else:
                    # close the finished line and open a new one with this
                    # token as its reference
                    output.append(line)
                    line = etree.Element('LINE')
                    line.append(token)
                    line_info = token_info

        # handle last line (only exists if at least one token was seen)
        if line is not None:
            output.append(line)

        for line in output.iter('LINE'):
            self.lineSummary(line)

        return output
示例#2
0
    def apply(self, page, c):
        """Group the direct children of ``page`` into PARAGRAPH elements.

        Each pair of consecutive children is compared: when their
        ``'height'`` values (from ``common.lineExtract``) are equal according
        to ``common.looseCompare`` with tolerance ``c['line_height_diff']``
        the second one stays in the current paragraph, otherwise it starts a
        new paragraph.

        Args:
            page: element whose direct children are the lines to group.
            c: configuration mapping; only ``c['line_height_diff']`` is read.

        Returns:
            The element produced by ``common.copyElementAttributes(page)``
            with the PARAGRAPH elements appended. A page without children
            yields no PARAGRAPH at all.
        """
        output = common.copyElementAttributes(page)

        # list(page) replaces the deprecated getchildren(). It is a snapshot,
        # so moving lines into paragraphs below cannot disturb the iteration.
        lines = list(page)
        if not lines:
            # nothing to group; avoid emitting an empty PARAGRAPH
            return output

        # Seed the first paragraph whenever there is at least one line, so a
        # page consisting of a single line keeps that line.
        paragraph = etree.Element('PARAGRAPH')
        paragraph.append(lines[0])

        for current, nxt in zip(lines, lines[1:]):
            c_info = common.lineExtract(current)
            n_info = common.lineExtract(nxt)
            if common.looseCompare(c_info['height'], n_info['height'],
                                   c['line_height_diff']):
                # same para
                paragraph.append(nxt)
            else:
                # new para: flush the finished one and start over with nxt
                output.append(paragraph)
                paragraph = etree.Element('PARAGRAPH')
                paragraph.append(nxt)

        # add final paragraph
        output.append(paragraph)

        return output
示例#3
0
    def apply(self, page, c):
        """Regroup the TOKEN elements of ``page`` into LINE elements.

        Tokens are visited TEXT by TEXT. A token is added to the current
        line when ``common.looseCompare`` reports its ``'y'`` value as equal
        to the ``'y'`` of the token that started that line, within the
        tolerance ``c['y_height_diff']``; any other token starts a new line.
        Afterwards ``self.lineSummary`` is invoked on each LINE of the
        result.

        Args:
            page: element that is searched for TEXT elements and, inside
                them, TOKEN elements.
            c: configuration mapping; only ``c['y_height_diff']`` is read.

        Returns:
            The element produced by ``common.copyElementAttributes(page)``
            with the LINE elements appended. No LINE is appended when the
            page contains no tokens.
        """
        output = common.copyElementAttributes(page)
        line = None
        line_info = None  # info of the token that started the current line

        for text in page.iter('TEXT'):
            # Take a snapshot of the tokens first: appending a token to a
            # LINE may re-parent it (lxml moves elements on append -- assumed
            # backend, confirm), so do not walk the tree while changing it.
            for token in list(text.iter('TOKEN')):
                token_info = common.tokenExtract(token)

                if line_info is None:
                    # very first token: start the first line
                    line_info = token_info
                    line = etree.Element('LINE')

                if common.looseCompare(token_info['y'], line_info['y'],
                                       c['y_height_diff']):
                    # same line
                    line.append(token)
                else:
                    # flush the finished line, then start a new one whose
                    # reference is this token
                    output.append(line)
                    line = etree.Element('LINE')
                    line.append(token)
                    line_info = token_info

        # handle last line (only exists if at least one token was seen)
        if line is not None:
            output.append(line)

        for line in output.iter('LINE'):
            self.lineSummary(line)

        return output
示例#4
0
    def apply(self, page, c):
        """Group the direct children of ``page`` into PARAGRAPH elements.

        Neighbouring children are compared pairwise: if
        ``common.looseCompare`` treats their ``'height'`` values (taken from
        ``common.lineExtract``) as equal within ``c['line_height_diff']``,
        the later one is kept in the current paragraph; if not, it begins a
        new paragraph.

        Args:
            page: element whose direct children are the lines to group.
            c: configuration mapping; only ``c['line_height_diff']`` is read.

        Returns:
            The element produced by ``common.copyElementAttributes(page)``
            with the PARAGRAPH elements appended. A page without children
            yields no PARAGRAPH at all.
        """
        output = common.copyElementAttributes(page)

        # list(page) is the non-deprecated spelling of getchildren(). Being a
        # snapshot, it stays stable while lines are moved into paragraphs.
        lines = list(page)
        if not lines:
            # nothing to group; do not emit an empty PARAGRAPH
            return output

        # The first line always opens the first paragraph -- including when
        # it is the only line on the page.
        paragraph = etree.Element('PARAGRAPH')
        paragraph.append(lines[0])

        for current, nxt in zip(lines, lines[1:]):
            c_info = common.lineExtract(current)
            n_info = common.lineExtract(nxt)
            if common.looseCompare(c_info['height'], n_info['height'],
                                   c['line_height_diff']):
                # same para
                paragraph.append(nxt)
            else:
                # new para: store the finished one, restart with nxt
                output.append(paragraph)
                paragraph = etree.Element('PARAGRAPH')
                paragraph.append(nxt)

        # add final paragraph
        output.append(paragraph)

        return output
示例#5
0
    def apply(self, page, c):
        """Rebuild ``page`` from the results of splitting each PARAGRAPH.

        Every PARAGRAPH found in ``page`` is handed to ``self.split`` along
        with ``c``; whatever that call yields is appended, in order, to the
        element returned by ``common.copyElementAttributes(page)``.

        Args:
            page: element searched for PARAGRAPH elements.
            c: configuration object, passed through to ``self.split``.

        Returns:
            The new element holding all split results.
        """
        rebuilt = common.copyElementAttributes(page)
        for source_paragraph in page.iter('PARAGRAPH'):
            # append every piece of this paragraph in one go
            rebuilt.extend(self.split(source_paragraph, c))
        return rebuilt
示例#6
0
 def apply(self, page, c):
     """Rebuild ``page`` from the results of splitting each PARAGRAPH.

     Each PARAGRAPH in ``page`` is passed to ``self.split`` together with
     ``c``. The items it yields are appended, in order, to the element
     returned by ``common.copyElementAttributes(page)``.

     Args:
         page: element searched for PARAGRAPH elements.
         c: configuration object, passed through to ``self.split``.

     Returns:
         The new element holding all split results.
     """
     rebuilt = common.copyElementAttributes(page)
     # Lazy flattening: split() runs for a paragraph only when its pieces
     # are about to be appended, exactly as with two nested loops.
     pieces = (
         piece
         for source_paragraph in page.iter('PARAGRAPH')
         for piece in self.split(source_paragraph, c)
     )
     for piece in pieces:
         rebuilt.append(piece)

     return rebuilt