def __init__(self, pdf, page_obj, page_number=None, initial_doctop=0, clean_unicode=True):
    """Set up the page geometry and optionally normalize character text.

    Args:
        pdf: Owning document object; stored unchanged on ``self.pdf``.
        page_obj: Underlying page object. Its ``attrs`` mapping is read for
            the ``Rotate``, ``CropBox`` and ``MediaBox`` entries, and its
            ``rotate`` attribute is overwritten with the normalized rotation.
        page_number: Stored unchanged on ``self.page_number``.
        initial_doctop: Offset stored, decimalized, on ``self.initial_doctop``.
        clean_unicode: When true, Unicode-normalize the ``text`` of every
            entry of ``self.chars`` that contains a non-ASCII character.
    """
    self.pdf = pdf
    self.page_obj = page_obj
    self.page_number = page_number

    # Fold the rotation into [0, 360) and mirror it onto the page object.
    self.rotation = self.page_obj.attrs.get("Rotate", 0) % 360
    self.page_obj.rotate = self.rotation
    self.initial_doctop = self.decimalize(initial_doctop)

    cropbox = page_obj.attrs.get("CropBox")
    mediabox = page_obj.attrs.get("MediaBox")
    self.cropbox = (
        self.decimalize(resolve_all(cropbox)) if cropbox is not None else None
    )
    # Fall back to the crop box when the resolved media box is empty/missing.
    self.mediabox = self.decimalize(resolve_all(mediabox) or self.cropbox)

    # The box corners may be stored in either order, so sort each axis.
    m = self.mediabox
    x_min, x_max = min(m[0], m[2]), max(m[0], m[2])
    y_min, y_max = min(m[1], m[3]), max(m[1], m[3])
    if self.rotation in (90, 270):
        # A quarter turn swaps the horizontal and vertical axes.
        self.bbox = self.decimalize((y_min, x_min, y_max, x_max))
    else:
        self.bbox = self.decimalize((x_min, y_min, x_max, y_max))

    if clean_unicode:
        # Mutate the char dicts in place: dict.update() returns None, so
        # building a new list from its result would replace every char
        # with None.
        for char in self.chars:
            text = char["text"]
            # Iterate per code point: ord() on the whole string would raise
            # for multi-character text. Pure-ASCII text is left untouched.
            if any(ord(ch) > 127 for ch in text):
                # NFKC folds compatibility characters (ligatures,
                # full-width forms, ...) into their plain equivalents.
                # NOTE(review): the previous call omitted the mandatory
                # normalization form; confirm NFKC is the intended one.
                char["text"] = unicodedata.normalize("NFKC", text)
def __init__(self, pdf, page_obj, page_number=None, initial_doctop=0):
    """Record the page's owner, rotation and bounding boxes.

    Args:
        pdf: Owning document object; kept as ``self.pdf``.
        page_obj: Underlying page object. Its ``attrs`` mapping supplies the
            ``Rotate``, ``CropBox`` and ``MediaBox`` entries, and its
            ``rotate`` attribute is set to the normalized rotation.
        page_number: Kept as ``self.page_number``.
        initial_doctop: Offset kept, decimalized, as ``self.initial_doctop``.
    """
    self.pdf = pdf
    self.page_obj = page_obj
    self.page_number = page_number

    page_attrs = page_obj.attrs

    # Rotation is folded into [0, 360) and mirrored onto the page object.
    self.rotation = page_attrs.get("Rotate", 0) % 360
    page_obj.rotate = self.rotation
    self.initial_doctop = self.decimalize(initial_doctop)

    raw_cropbox = page_attrs.get("CropBox")
    raw_mediabox = page_attrs.get("MediaBox")

    if raw_cropbox is None:
        self.cropbox = None
    else:
        self.cropbox = self.decimalize(resolve_all(raw_cropbox))

    # An empty/missing media box falls back to the crop box.
    self.mediabox = self.decimalize(resolve_all(raw_mediabox) or self.cropbox)

    # Corners may come in either order, so take min/max along each axis.
    box = self.mediabox
    xs = (box[0], box[2])
    ys = (box[1], box[3])
    low_x, high_x = min(xs), max(xs)
    low_y, high_y = min(ys), max(ys)

    # Quarter-turn rotations swap the horizontal and vertical axes.
    if self.rotation in (90, 270):
        corners = (low_y, low_x, high_y, high_x)
    else:
        corners = (low_x, low_y, high_x, high_y)
    self.bbox = self.decimalize(corners)
def _decimalize(v, q=None):
    """Convert ``v`` to ``Decimal``, recursing into tuples and lists.

    Args:
        v: A ``Decimal`` (returned as-is, subclasses included), an
            integer-like or real number, a tuple/list of such values, or a
            ``PDFObjRef`` that resolves to one of them.
        q: Optional quantization step applied to real (non-integral) values
            only, rounding half up. May be given as a number or a ``Decimal``.

    Returns:
        A ``Decimal``, or a container of the same type as ``v`` holding the
        converted items.

    Raises:
        ValueError: If ``v`` is of none of the supported types.
    """
    # isinstance (rather than an exact type comparison) so Decimal
    # subclasses are passed through instead of being rejected below.
    if isinstance(v, Decimal):
        return v

    # Integer-like values convert exactly; ``q`` does not apply to them.
    if isinstance(v, numbers.Integral):
        return Decimal(int(v))

    # Float-like values go through repr() so the Decimal matches the
    # shortest decimal string of the float, not its binary expansion.
    if isinstance(v, numbers.Real):
        converted = Decimal(repr(v))
        if q is None:
            return converted
        # repr() of a Decimal is not a valid Decimal literal, so use it
        # directly when the caller already supplies one.
        step = q if isinstance(q, Decimal) else Decimal(repr(q))
        return converted.quantize(step, rounding=ROUND_HALF_UP)

    # Bulk-convert containers, preserving the container type.
    if isinstance(v, (tuple, list)):
        return type(v)(decimalize(x, q) for x in v)

    # Resolve indirect references, then convert whatever they point to.
    if isinstance(v, PDFObjRef):
        return decimalize(resolve_all(v), q)

    raise ValueError("Cannot convert {0} to Decimal.".format(v))
def process_object(obj):
    """Flatten ``obj`` into an attribute dict and file it under its kind.

    Each entry of ``obj.__dict__`` whose key is not in ``IGNORE`` is resolved
    and passed through the matching ``CONVERSIONS`` callable. The resulting
    dict is appended to ``objects[kind]``, where ``kind`` is the lower-cased
    class name with ``lt_pat`` stripped. Objects exposing ``_objs`` are
    walked recursively.

    NOTE(review): ``pno``, ``h``, ``idc`` and ``objects`` are free variables
    taken from a scope not visible here -- presumably the page number, page
    height, initial doctop and the result mapping; confirm against the
    enclosing function.
    """
    attr = {
        key: CONVERSIONS[key](resolve_all(value))
        for key, value in obj.__dict__.items()
        if key not in IGNORE
    }

    kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()
    attr["object_type"] = kind
    attr["page_number"] = pno

    if hasattr(obj, "get_text"):
        attr["text"] = obj.get_text()

    if kind == "curve":
        attr["points"] = [point2coord(pt) for pt in obj.pts]

    if attr.get("y0") is not None:
        # y0/y1 are flipped against ``h`` to obtain top/bottom.
        top = h - attr["y1"]
        attr["top"] = top
        attr["bottom"] = h - attr["y0"]
        attr["doctop"] = idc + top

    if objects.get(kind) is None:
        objects[kind] = []
    objects[kind].append(attr)

    # Container objects carry their children in ``_objs``.
    for child in getattr(obj, "_objs", ()):
        process_object(child)
def _parse_info(doc):
    """Return the resolved ``Info`` entry of the document's first xref trailer.

    Args:
        doc: Object exposing ``xrefs``, a sequence whose first element has a
            ``trailer`` mapping.

    Returns:
        The result of ``resolve_all`` on the trailer's ``Info`` entry, or
        ``None`` when the trailer has no (truthy) ``Info`` entry.
    """
    xref = doc.xrefs[0]
    info_ref = xref.trailer.get('Info')
    if not info_ref:
        # Without this explicit return the function would fall through to
        # an unbound local and raise UnboundLocalError.
        return None
    return resolve_all(info_ref)