Пример #1
0
    def split(self, paragraph, c):
        """Split the child lines of ``paragraph`` on changes of left position.

        Each child is passed through ``common.lineExtract``; its 'left' value
        is compared (``common.looseCompare`` with ``c['para_indent_diff']``)
        against the value returned by ``self.estimateIndent``.  A line that
        does not match starts a new PARAGRAPH, except for the very first
        line, which always joins the first paragraph.

        Every paragraph that is closed because a new one starts is marked
        ``complete='yes'``.  The final paragraph is marked ``complete='no'``
        only when the last line's 'right' value matches the value returned
        by ``self.estimateJustify`` and the paragraph holds more than one
        line; otherwise it is marked ``complete='yes'``.

        Returns a list of PARAGRAPH elements; the list is empty when
        ``paragraph`` has no children.
        """
        lines = paragraph.getchildren()
        indent = self.estimateIndent(paragraph, c)
        justify = self.estimateJustify(paragraph, c)
        output = []

        # Nothing to split: no paragraph element is emitted at all.
        if not lines:
            return output

        current = etree.Element('PARAGRAPH')
        for position, line in enumerate(lines):
            info = common.lineExtract(line)
            aligned = common.looseCompare(info['left'], indent,
                                          c['para_indent_diff'])
            # A non-aligned line opens a new paragraph, unless it is the
            # first line (an indented first line stays where it is).
            if not aligned and position != 0:
                current.set('complete', 'yes')
                output.append(current)
                current = etree.Element('PARAGRAPH')
            current.append(line)

        # ``info`` now describes the last line; it decides whether the
        # trailing paragraph is flagged as complete.
        reaches_right = common.looseCompare(info['right'], justify,
                                            c['para_indent_diff'])
        if reaches_right and len(current) != 1:
            current.set('complete', 'no')
        else:
            current.set('complete', 'yes')

        output.append(current)
        return output
Пример #2
0
    def split(self, paragraph, c):
        """Split the child lines of ``paragraph`` on changes of left position.

        Each child is passed through ``common.lineExtract``; its 'left' value
        is compared (``common.looseCompare`` with ``c['para_indent_diff']``)
        against the value returned by ``self.estimateIndent``.  A line that
        does not match starts a new PARAGRAPH, except for the very first
        line, which always joins the first paragraph.

        Every paragraph that is closed because a new one starts is marked
        ``complete='yes'``.  The final paragraph is marked ``complete='no'``
        only when the last line's 'right' value matches the value returned
        by ``self.estimateJustify`` and the paragraph holds more than one
        line; otherwise it is marked ``complete='yes'``.

        Returns a list of PARAGRAPH elements; the list is empty when
        ``paragraph`` has no children.
        """
        lines = paragraph.getchildren()
        indent = self.estimateIndent(paragraph, c)
        justify = self.estimateJustify(paragraph, c)
        output = []

        # Nothing to split: no paragraph element is emitted at all.
        if not lines:
            return output

        current = etree.Element('PARAGRAPH')
        for position, line in enumerate(lines):
            info = common.lineExtract(line)
            aligned = common.looseCompare(info['left'], indent,
                                          c['para_indent_diff'])
            # A non-aligned line opens a new paragraph, unless it is the
            # first line (an indented first line stays where it is).
            if not aligned and position != 0:
                current.set('complete', 'yes')
                output.append(current)
                current = etree.Element('PARAGRAPH')
            current.append(line)

        # ``info`` now describes the last line; it decides whether the
        # trailing paragraph is flagged as complete.
        reaches_right = common.looseCompare(info['right'], justify,
                                            c['para_indent_diff'])
        if reaches_right and len(current) != 1:
            current.set('complete', 'no')
        else:
            current.set('complete', 'yes')

        output.append(current)
        return output
Пример #3
0
    def apply(self, page, c):
        """Regroup the TOKEN elements of ``page`` into LINE elements.

        Tokens are visited in the order produced by iterating the TEXT
        elements of ``page`` and, inside each, its TOKEN elements.  Every
        token's 'y' value (from ``common.tokenExtract``) is compared, using
        ``common.looseCompare`` with ``c['y_height_diff']``, against the 'y'
        of the token that opened the current line; a mismatch closes that
        line and opens a new one starting with the mismatching token.

        Returns the element produced by ``common.copyElementAttributes(page)``
        with the LINE elements appended, after ``self.lineSummary`` has been
        called on each of them.
        """
        output = common.copyElementAttributes(page)
        anchor = None  # extracted info of the token that opened the line
        for text in page.iter('TEXT'):
            for token in text.iter('TOKEN'):
                info = common.tokenExtract(token)

                if anchor is None:
                    # Very first token: it opens the first line.
                    anchor = info
                    current = etree.Element('LINE')

                on_same_line = common.looseCompare(info['y'], anchor['y'],
                                                   c['y_height_diff'])
                if not on_same_line:
                    # Close the running line; this token opens the next one.
                    output.append(current)
                    current = etree.Element('LINE')
                    anchor = info

                # NOTE(review): if ``etree`` is lxml, append() re-parents the
                # token out of its TEXT element - confirm.
                current.append(token)

        # The line still being built is only emitted if a token was seen.
        if anchor is not None:
            output.append(current)

        for finished in output.iter('LINE'):
            self.lineSummary(finished)

        return output
Пример #4
0
    def split(self, paragraph, c):
        """Split the child lines of ``paragraph`` on unusual vertical gaps.

        For each pair of consecutive lines, the difference between their
        'base' values (from ``common.lineExtract``) is compared, using
        ``common.looseCompare`` with ``c['vertical_diff']``, against the
        value returned by ``self.lineDiff(lines)``.  A matching gap keeps the
        second line in the current PARAGRAPH; any other gap starts a new one.

        Returns a list of PARAGRAPH elements.  The list always holds at
        least one element; for a ``paragraph`` without children that single
        PARAGRAPH is empty.
        """
        lines = paragraph.getchildren()
        common_diff = self.lineDiff(lines)
        pieces = []

        current = etree.Element('PARAGRAPH')
        if lines:
            # The first line always belongs to the first paragraph.
            current.append(lines[0])

        for upper, lower in zip(lines, lines[1:]):
            upper_info = common.lineExtract(upper)
            lower_info = common.lineExtract(lower)
            gap = lower_info['base'] - upper_info['base']

            if not common.looseCompare(common_diff, gap, c['vertical_diff']):
                # Gap differs from the usual one: close and start afresh.
                pieces.append(current)
                current = etree.Element('PARAGRAPH')
            current.append(lower)

        pieces.append(current)
        return pieces
Пример #5
0
    def apply(self, page, c):
        """Group the child lines of ``page`` into PARAGRAPH elements.

        Consecutive lines whose 'height' values (from ``common.lineExtract``)
        agree under ``common.looseCompare`` with ``c['line_height_diff']``
        are kept in the same paragraph; any other pair starts a new one.

        Args:
            page: element whose direct children are the lines to group.
            c: mapping that provides the 'line_height_diff' tolerance.

        Returns:
            The element produced by ``common.copyElementAttributes(page)``
            with the PARAGRAPH elements appended.  A page without children
            still yields one empty PARAGRAPH, as it did before.
        """
        output = common.copyElementAttributes(page)
        paragraph = etree.Element('PARAGRAPH')
        lines = page.getchildren()

        # Seed the first paragraph with the first line.  This has to happen
        # for a single-line page as well: the pair loop below never runs in
        # that case, so the previous ``> 1`` check silently lost the line.
        if lines:
            paragraph.append(lines[0])

        for current, nxt in zip(lines, lines[1:]):
            c_info = common.lineExtract(current)
            n_info = common.lineExtract(nxt)
            if not common.looseCompare(c_info['height'], n_info['height'],
                                       c['line_height_diff']):
                # Height changed: close this paragraph and open a new one.
                output.append(paragraph)
                paragraph = etree.Element('PARAGRAPH')
            paragraph.append(nxt)

        # add final paragraph
        output.append(paragraph)

        return output
Пример #6
0
    def apply(self, page, c):
        """Regroup the TOKEN elements of ``page`` into LINE elements.

        Tokens are visited in the order produced by iterating the TEXT
        elements of ``page`` and, inside each, its TOKEN elements.  Every
        token's 'y' value (from ``common.tokenExtract``) is compared, using
        ``common.looseCompare`` with ``c['y_height_diff']``, against the 'y'
        of the token that opened the current line; a mismatch closes that
        line and opens a new one starting with the mismatching token.

        Returns the element produced by ``common.copyElementAttributes(page)``
        with the LINE elements appended, after ``self.lineSummary`` has been
        called on each of them.
        """
        output = common.copyElementAttributes(page)
        anchor = None  # extracted info of the token that opened the line
        for text in page.iter('TEXT'):
            for token in text.iter('TOKEN'):
                info = common.tokenExtract(token)

                if anchor is None:
                    # Very first token: it opens the first line.
                    anchor = info
                    current = etree.Element('LINE')

                on_same_line = common.looseCompare(info['y'], anchor['y'],
                                                   c['y_height_diff'])
                if not on_same_line:
                    # Close the running line; this token opens the next one.
                    output.append(current)
                    current = etree.Element('LINE')
                    anchor = info

                # NOTE(review): if ``etree`` is lxml, append() re-parents the
                # token out of its TEXT element - confirm.
                current.append(token)

        # The line still being built is only emitted if a token was seen.
        if anchor is not None:
            output.append(current)

        for finished in output.iter('LINE'):
            self.lineSummary(finished)

        return output
Пример #7
0
    def apply(self, page, c):
        """Group the child lines of ``page`` into PARAGRAPH elements.

        Consecutive lines whose 'height' values (from ``common.lineExtract``)
        agree under ``common.looseCompare`` with ``c['line_height_diff']``
        are kept in the same paragraph; any other pair starts a new one.

        Args:
            page: element whose direct children are the lines to group.
            c: mapping that provides the 'line_height_diff' tolerance.

        Returns:
            The element produced by ``common.copyElementAttributes(page)``
            with the PARAGRAPH elements appended.  A page without children
            still yields one empty PARAGRAPH, as it did before.
        """
        output = common.copyElementAttributes(page)
        paragraph = etree.Element('PARAGRAPH')
        lines = page.getchildren()

        # Seed the first paragraph with the first line.  This has to happen
        # for a single-line page as well: the pair loop below never runs in
        # that case, so the previous ``> 1`` check silently lost the line.
        if lines:
            paragraph.append(lines[0])

        for current, nxt in zip(lines, lines[1:]):
            c_info = common.lineExtract(current)
            n_info = common.lineExtract(nxt)
            if not common.looseCompare(c_info['height'], n_info['height'],
                                       c['line_height_diff']):
                # Height changed: close this paragraph and open a new one.
                output.append(paragraph)
                paragraph = etree.Element('PARAGRAPH')
            paragraph.append(nxt)

        # add final paragraph
        output.append(paragraph)

        return output
Пример #8
0
    def split(self, paragraph, c):
        """Split the child lines of ``paragraph`` on unusual vertical gaps.

        For each pair of consecutive lines, the difference between their
        'base' values (from ``common.lineExtract``) is compared, using
        ``common.looseCompare`` with ``c['vertical_diff']``, against the
        value returned by ``self.lineDiff(lines)``.  A matching gap keeps the
        second line in the current PARAGRAPH; any other gap starts a new one.

        Returns a list of PARAGRAPH elements.  The list always holds at
        least one element; for a ``paragraph`` without children that single
        PARAGRAPH is empty.
        """
        lines = paragraph.getchildren()
        common_diff = self.lineDiff(lines)
        pieces = []

        current = etree.Element('PARAGRAPH')
        if lines:
            # The first line always belongs to the first paragraph.
            current.append(lines[0])

        for upper, lower in zip(lines, lines[1:]):
            upper_info = common.lineExtract(upper)
            lower_info = common.lineExtract(lower)
            gap = lower_info['base'] - upper_info['base']

            if not common.looseCompare(common_diff, gap, c['vertical_diff']):
                # Gap differs from the usual one: close and start afresh.
                pieces.append(current)
                current = etree.Element('PARAGRAPH')
            current.append(lower)

        pieces.append(current)
        return pieces