Пример #1
0
    def parse_font_sizes(self, objs):
        '''
        Collect per-character text and font-size records from pdfminer
        layout objects.

        Every LTTextBox in ``objs`` is scanned line by line; for each
        non-blank LTTextLine one record is produced per child object of the
        line. LTFigure containers are descended into recursively and their
        records are included in the result, in traversal order.

        :param objs: iterable of pdfminer layout objects (for example the
            children of a page or of a figure).
        :return: list of dicts with the keys ``'characters'``, ``'line'``,
            ``'line_number'`` and ``'font_size'``. ``'line_number'`` is the
            0-based index of the text line in traversal order (blank lines
            are counted but yield no records). All four values are ``None``
            for a child that is neither an LTChar nor an LTAnno.
        '''
        alphanum_chars = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        ascii_chars = '''0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'''

        # Seed the result with a sentinel record so that the "inherit the
        # previous font size" fallback works for the very first character.
        char_properties_list = [{'characters':'', 'line':'', 'line_number':0, 'font_size':0}]

        def collect(layout_objs, line_counter):
            # Append records for layout_objs to char_properties_list and
            # return the updated line counter. The counter is threaded
            # through the recursion so that line numbers stay unique across
            # nested figures.
            for obj in layout_objs:
                if isinstance(obj, pdfminer.layout.LTTextBox):
                    for o in obj._objs:
                        if not isinstance(o, pdfminer.layout.LTTextLine):
                            continue
                        line_counter += 1
                        line_text = o.get_text()
                        if not line_text.strip():
                            # Blank line: counted, but contributes no records.
                            continue
                        for c in o._objs:
                            char_properties = {'characters':None, 'line':None, 'line_number':None, 'font_size':None}
                            if isinstance(c, (pdfminer.layout.LTChar, pdfminer.layout.LTAnno)):
                                char_text = c.get_text()
                                char_properties['line'] = line_text
                                char_properties['line_number'] = line_counter

                                # Default: keep the text when it is found in
                                # ascii_chars, otherwise blank it out.
                                if char_text in ascii_chars:
                                    char_properties['characters'] = char_text
                                else:
                                    char_properties['characters'] = ''

                                # Whenever unaccent() succeeds its result
                                # replaces the default chosen above; the
                                # default only survives if unaccent() raises.
                                # NOTE(review): unaccent is defined outside
                                # this block; presumably it strips diacritics
                                # -- confirm that overriding the ASCII filter
                                # for every character is intended.
                                try:
                                    char_properties['characters'] = unaccent(char_text)
                                except Exception:
                                    # Deliberate best effort: keep the default.
                                    pass

                                # Only alphanumeric characters supply their
                                # own size; everything else inherits the size
                                # of the previous record, to avoid the large
                                # font sizes of special characters.
                                if char_text in alphanum_chars:
                                    char_properties['font_size'] = round(c.size,1)
                                else:
                                    char_properties['font_size'] = char_properties_list[-1]['font_size']

                            char_properties_list.append(char_properties)
                # If it's a container, recurse. The nested records go into the
                # same result list instead of being thrown away.
                elif isinstance(obj, pdfminer.layout.LTFigure):
                    line_counter = collect(obj._objs, line_counter)
            return line_counter

        collect(objs, -1)

        # Drop the sentinel record added when the list was initialised.
        return char_properties_list[1:]
Пример #2
0
	def apply(self, track):
		"""Return the formatted string for ``track`` using the current format.

		The attributes of ``track`` are substituted into ``self.current_format``
		by ``__apply_attrib``, every ``Format.escaped_symbol`` in the result is
		replaced with ``Format.symbol``, and the text is finally passed through
		``unaccent`` (defined elsewhere; presumably strips accents -- confirm).
		"""
		rendered = self.__apply_attrib(track, self.current_format)
		unescaped = rendered.replace(Format.escaped_symbol, Format.symbol)
		return unaccent(unescaped)
Пример #3
0
 def make_slug(preferred_name):
     """Build a lowercase, hyphen-separated slug from ``preferred_name``.

     The name is converted with ``unicode`` and passed through ``unaccent``
     (defined elsewhere; presumably strips accents -- confirm). The result is
     split on runs of non-word characters, empty pieces are dropped, and the
     remaining pieces are joined with ``'-'`` and lowercased.
     """
     # Raw string: '\W' in a plain literal is an invalid escape sequence.
     words = re.split(r'\W+', unaccent(unicode(preferred_name)))
     # Splitting yields empty strings at the edges when the name starts or
     # ends with a separator; skip them so the slug has no stray hyphens.
     return '-'.join([word for word in words if word]).lower()