def parse_font_sizes(self, objs):
    '''
    Collect per-character font-size records from pdfminer layout objects.

    Walks ``objs`` in order. For every LTTextLine found directly inside an
    LTTextBox, one record is produced per LTChar/LTAnno child. LTFigure
    containers are descended into, so text lines nested inside a figure are
    part of the result as well. Lines whose text is only whitespace produce
    no records but still consume a line number.

    Each record is a dict with these keys:
      characters  -- the character text; text not found in the ASCII
                     whitelist is passed through ``unaccent`` and becomes
                     an empty string if that call raises
      line        -- full text of the line containing the character
      line_number -- 0-based index of that line among all text lines seen
      font_size   -- ``size`` rounded to one decimal for alphanumeric
                     LTChar objects; every other character inherits the
                     font size of the previous record (0 if there is none)

    Returns the list of records in the order the characters were visited.
    '''
    alphanum_chars = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    ascii_chars = '''0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'''

    def iter_text_lines(layout_objs):
        # Yield every LTTextLine in visiting order. Figures are containers,
        # so their children are walked too; yielding from one generator keeps
        # a single line numbering across the whole tree.
        for obj in layout_objs:
            if isinstance(obj, pdfminer.layout.LTTextBox):
                for child in obj._objs:
                    if isinstance(child, pdfminer.layout.LTTextLine):
                        yield child
            elif isinstance(obj, pdfminer.layout.LTFigure):
                for nested_line in iter_text_lines(obj._objs):
                    yield nested_line

    char_properties_list = []
    # Font size carried over to characters that do not define their own.
    previous_font_size = 0

    for line_number, line in enumerate(iter_text_lines(objs)):
        line_text = line.get_text()
        if not line_text.strip():
            continue

        for char in line._objs:
            if not isinstance(char, (pdfminer.layout.LTChar,
                                     pdfminer.layout.LTAnno)):
                continue

            char_text = char.get_text()

            if char_text in ascii_chars:
                characters = char_text
            else:
                # Non-whitelisted text (accented letters and other unicode
                # characters) is normalised with unaccent. This is a
                # deliberate best effort: if it fails the character is
                # dropped instead of aborting the whole parse.
                try:
                    characters = unaccent(char_text)
                except Exception:
                    characters = ''

            # Only alphanumeric glyphs contribute their own size, to avoid
            # the unusually large sizes reported for special characters.
            # LTAnno objects are virtual characters without a size, so the
            # attribute is only read on LTChar.
            if (isinstance(char, pdfminer.layout.LTChar)
                    and char_text in alphanum_chars):
                font_size = round(char.size, 1)
            else:
                font_size = previous_font_size

            char_properties_list.append({
                'characters': characters,
                'line': line_text,
                'line_number': line_number,
                'font_size': font_size,
            })
            previous_font_size = font_size

    return char_properties_list
def apply(self, track):
    """Return the formatted, unaccented string for ``track``.

    The string is produced by ``self.__apply_attrib`` from ``track`` and
    ``self.current_format``. Every occurrence of ``Format.escaped_symbol``
    in it is then replaced by ``Format.symbol``, and the result is passed
    through ``unaccent`` before being returned.
    """
    rendered = self.__apply_attrib(track, self.current_format)
    # Escaped symbols are restored only after attribute substitution is done.
    unescaped = rendered.replace(Format.escaped_symbol, Format.symbol)
    return unaccent(unescaped)
def make_slug(preferred_name):
    """Build a lowercase, hyphen-separated slug from ``preferred_name``.

    The value is converted to text, passed through ``unaccent``, split on
    runs of non-word characters, and the non-empty pieces are joined with
    '-' and lowercased. An input without any word characters yields an
    empty string.
    """
    # Works on both major Python versions: ``unicode`` only exists on
    # Python 2, while on Python 3 ``str`` is already the text type.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    normalized = unaccent(to_text(preferred_name))
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    # Splitting can leave empty strings at either end, hence the filter.
    words = [word for word in re.split(r'\W+', normalized) if word]
    return '-'.join(words).lower()