if (page == -1): print(("Error %d in %s() on page %d: %s\n" % \ (tet.get_errnum(), tet.get_apiname(), pageno, \ tet.get_errmsg()))) continue # try next page */ # Retrieve all text fragments for the page */ text = tet.get_text(page) while (text): # Loop over all characters */ ci = tet.get_char_info(page) while (ci): # We need only the font name and size the text # position could be fetched from ci->x and ci->y. #/ fontname = tet.pcos_get_string(doc, \ "fonts[%d]/name" % ci["fontid"]) # Check whether we found a match */ # C only: some versions of strstr don't allow empty # strings, so we better check */ if (ci["fontsize"] >= fontsizetrigger and \ fontname.find(fontnametrigger) != -1): # print the retrieved font name, size, and text */ print(("[%s %.2f] %s" % (fontname, ci["fontsize"], text))) ci = tet.get_char_info(page) # In this sample we check only the first character of # each fragment. #/ break text = tet.get_text(page) if (tet.get_errnum() != 0):
# Disabled report lines, kept for reference:
# print((" Tagged PDF: %s\n" %
#     yesno(tet.pcos_get_number(doc, "tagged"))))
# print(("No. of pages: %d" %
#     int(tet.pcos_get_number(doc, "length:pages"))))
# print((" Page 1 size: width=%g, height=%g" %
#     (tet.pcos_get_number(doc, "pages[%d]/width" % 0),
#      tet.pcos_get_number(doc, "pages[%d]/height" % 0))))

# Number of entries in the pCOS "fonts" array.
count = int(tet.pcos_get_number(doc, "length:fonts"))
# print(("No. of fonts: %d" % count))

# Query type and name of every font. The per-font report that consumed these
# values is disabled, so within this excerpt they are fetched but not read.
# NOTE(review): `type` shadows the builtin of the same name; it is left as-is
# because code outside this excerpt may still refer to it.
for i in range(count):
    type = tet.pcos_get_string(doc, "fonts[{}]/type".format(i))
    name = tet.pcos_get_string(doc, "fonts[{}]/name".format(i))
    # if (tet.pcos_get_number(doc, "fonts[%d]/embedded" % i)):
    #     print("embedded %s font %s" % (type, name))
    # else:
    #     print("unembedded %s font %s" % (type, name))
# print()

plainmetadata = int(tet.pcos_get_number(doc, "encrypt/plainmetadata"))

# With pcosmode 1, metadata not in plain form and the "nocopy" flag set,
# announce that nothing more can be reported. "encrypt/nocopy" is only
# queried when the first two conditions already hold.
if pcosmode == 1 and not plainmetadata:
    if int(tet.pcos_get_number(doc, "encrypt/nocopy")):
        print("Restricted mode: no more information available\n")
ti = tet.get_image_info(page) while (ti): # Report image geometry print("page %d: %.2fx%.2fpt, alpha=%.2f, beta=%.2f" % (pageno, ti["width"], ti["height"], ti["alpha"], ti["beta"])) # Retrieve additional image properties with pCOS print(" id=%d, %dx%d pixel" % ( ti["imageid"], tet.pcos_get_number(doc, "images[%d]/Width" % ti["imageid"]), tet.pcos_get_number(doc, "images[%d]/Height" % ti["imageid"]))) cs = tet.pcos_get_number(doc, "images[%d]/colorspaceid" % ti["imageid"]) if (cs != -1): print(" %dx%d bit %s" % (tet.pcos_get_number(doc, "colorspaces[%d]/components" % cs), tet.pcos_get_number(doc, "images[%d]/bpc" % ti["imageid"]), tet.pcos_get_string(doc, "colorspaces[%d]/name" % cs ))) else: # cs==-1 may happen for some JPEG 2000 images. bpc, # colorspace name and number of components are not # available in this case. print("JPEG2000") # Fetch image data and write it to a disk file. The # output filename is generated from the input filename, # page number and image ID. imageoptlist = baseimageoptlist + " filename {" + outfilebase + "_p" + repr(pageno) + "_I" + repr(ti["imageid"]) + "}" if (tet.write_image_file(doc, ti["imageid"], imageoptlist) == -1):
# Look up pixel size, bits per component and color space id of the image
# through pCOS.
width = tet.pcos_get_number(doc, "images[%d]/Width" % imageid)
height = tet.pcos_get_number(doc, "images[%d]/Height" % imageid)
bpc = tet.pcos_get_number(doc, "images[%d]/bpc" % imageid)
cs = tet.pcos_get_number(doc, "images[%d]/colorspaceid" % imageid)

# Assemble a one-line description of the image.
txt = "image I%d: %dx%d pixel, " % (imageid, width, height)

if cs == -1:
    # May happen for some JPEG 2000 images: bits per component, color space
    # name and component count are unavailable then.
    txt += "JPEG2000"
else:
    txt += "%dx%d bit %s" % (
        tet.pcos_get_number(doc, "colorspaces[%d]/components" % cs),
        bpc,
        tet.pcos_get_string(doc, "colorspaces[%d]/name" % cs),
    )

# A non-zero mergetype is appended: 1 is reported as "artificial", any other
# non-zero value as "consumed".
if mergetype:
    txt += ", mergetype=" + ("artificial" if mergetype == 1 else "consumed")

print(txt)