def get_metadata(self, tag=None):
    # lazily load and cache the document metadata
    if self.__metadata is None:
        mpath = self.metadata_path()
        if os.path.exists(mpath):
            self.__metadata = read_metadata(mpath)
    if tag:
        return self.__metadata and self.__metadata.get(tag)
    else:
        return self.__metadata or {}
def rip (self, location, doc_id):
    rootpath = os.path.join(location, "originals", "original.html")
    pagecontentspath = os.path.join(location, "webpagecontents.txt")
    md = read_metadata(os.path.join(location, "metadata.txt"))
    mimetype = md.get("apparent-mime-type")
    #note("location is %s, rootpath is %s, mimetype = %s", location, rootpath, mimetype)
    if (mimetype == "text/html") and os.path.exists(rootpath):
        # clean it
        pc = PageCleaner2(rootpath)
        text = pc.textify().strip()
        if text:
            fp = codecs.open(pagecontentspath, "w", "UTF-8")
            fp.write(text)
            fp.write("\n")
            fp.close()
def main (argv):
    if len(argv) < 1 or (not os.path.isdir(argv[0])):
        sys.stderr.write("Invalid directory specified.\n")
        sys.exit(1)
    set_verbosity(4)
    files = os.listdir(argv[0])
    if ("docs" in files) and ("overhead" in files):
        from uplib.repository import Repository
        from uplib.plibUtil import configurator
        uplib_version = configurator().get("UPLIB_VERSION")
        r = Repository(uplib_version, argv[0],
                       read_metadata(os.path.join(argv[0], "overhead", "metadata.txt")))
        build_index_1_0(r)
def update_document_metadata_from_acm_diglib (location):

    def charref_replace(matchobj):
        return unichr(int(matchobj.group('charcode')))

    def parse_endnote(newdict, md, endnote):
        parts = endnote.strip().split("\n")
        authors = ""
        for part in parts:
            p = ENDNOTE_CHARREF.sub(charref_replace, part.strip())
            if p.startswith("%T "):
                newdict['title'] = p[3:].strip()
                newdict['title-is-original-filepath'] = ''
            elif p.startswith("%P "):
                newdict['page-numbers'] = p[3:].strip()
            elif p.startswith("%D "):
                # we override any existing date, because often the PDF file had
                # a bad date in it -- the date it was scanned to add to the library
                year, month, day = parse_date(p[3:].strip())
                newdict['date'] = "%s/%s/%s" % (month, day, year)
            elif p.startswith("%A "):
                # ignore any author metadata in the PDF file
                if authors:
                    authors += " and "
                authors += p[3:].strip()
        if authors:
            # store through the parameter, not the enclosing scope's dict
            newdict['authors'] = authors

    mdpath = os.path.join(location, "metadata.txt")
    md = read_metadata(mdpath)
    if md.has_key("original-url") and "portal.acm.org" in md.get("original-url"):
        bibtex, endnote, abstract = fetch_bibtex_and_endnote_from_acm_diglib(md.get("original-url"))
        if bibtex or endnote:
            d = {}
            if bibtex:
                d['bibtex-citation'] = re.sub("\n", " ", bibtex)
            if endnote:
                parse_endnote(d, md, endnote)
                d['endnote-citation'] = re.sub("\n", " / ", endnote)
            if bibtex and not md.has_key("citation"):
                d["citation"] = re.sub("\n", " ", bibtex)
            if abstract and not md.has_key("abstract"):
                d["abstract"] = re.sub("\n|<par>|</par>", " ", abstract)
            update_metadata(mdpath, d)
        else:
            note("Couldn't fetch citation info for URL \"%s\".", md.get("original-url"))
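# A hedged, self-contained sketch (not part of the original module) of the
# EndNote format parsed above: the ACM digital library returns records as
# "%X value" lines, and parse_endnote maps %T/%A/%P/%D onto UpLib metadata
# fields.  parse_endnote itself is local to the function above, so this demo
# inlines the same field dispatch, on hypothetical citation data.
def _example_endnote_parse():
    sample = "%T An Example Paper\n%A Jane Author\n%A John Coauthor\n%P 101-110"
    d = {}
    authors = ""
    for p in [part.strip() for part in sample.split("\n")]:
        if p.startswith("%T "):
            d['title'] = p[3:].strip()
        elif p.startswith("%P "):
            d['page-numbers'] = p[3:].strip()
        elif p.startswith("%A "):
            # multiple %A lines are joined with " and ", as above
            if authors:
                authors += " and "
            authors += p[3:].strip()
    if authors:
        d['authors'] = authors
    # d == {'title': 'An Example Paper', 'page-numbers': '101-110',
    #       'authors': 'Jane Author and John Coauthor'}
    return d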
def rip (self, folder, docid):

    def encodestring(s):
        # nytimes strings have xml char refs, and we want Unicode
        if not s:
            return s
        s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
        # name2codepoint is a dict, so index it and convert the codepoint to a char
        s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
        return s

    mdpath = os.path.join(folder, "metadata.txt")
    originalspath = os.path.join(folder, "originals", "original.html")
    if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
        return
    md = read_metadata(mdpath)
    url = md.get("original-url")
    if not url:
        return
    host, port, path = parse_URL(url)
    if host != "www.nytimes.com":
        return
    # OK, it's from the NY Times
    new_metadata = MetadataGatherer.parse(originalspath)
    if "source" not in md:
        md["source"] = "New York Times"
    # not all articles have metadata...
    if not ((('title' in new_metadata) or ('hdl' in new_metadata)) and ('pdate' in new_metadata)):
        note(3, "No metadata in article: %s", new_metadata)
        return
    md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
    if "date" not in md:
        # get the date
        d = new_metadata.get("pdate")
        md["date"] = "%s/%s/%s" % (d[4:6], d[6:], d[:4])
    if "authors" not in md:
        # get the byline
        d = new_metadata.get("byl")
        if d:
            if d.startswith("By "):
                d = d[3:]
            # capitalize properly
            d = d.title()
            # lowercase "And"
            d = d.replace(" And ", " and ")
            md["authors"] = encodestring(d)
    d = new_metadata.get("keywords")
    d0 = md.get("keywords")
    if d and d0:
        d0 += ("," + d)
    elif d:
        d0 = d
    if d0:
        md["keywords"] = encodestring(d0)
    if new_metadata.get("description"):
        md["summary"] = encodestring(new_metadata.get("description"))
    update_metadata(mdpath, md)
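# A hedged demonstration (not part of the original module) of the character
# reference decoding done by encodestring above: numeric references ("&#233;")
# and named references ("&amp;") both become Unicode characters.  The input
# string is hypothetical.
def _example_decode_charrefs():
    import htmlentitydefs
    s = "Caf&#233; &amp; Bistro"
    s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
    s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
    return s    # u"Caf\xe9 & Bistro"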
def thumbnail_translation_and_scaling (folder, d=None, update=true, recalc=false):
    # 'translation' is in units of points
    # 'scaling' is in units of pixels/point

    if d is None:
        d = dict()

    def find_data (key):
        return d.get(key) or doc_metadata.get(key)

    def parse_value (x):
        if x is None:
            return None
        elif type(x) in types.StringTypes:
            return eval('(' + x + ')')
        elif type(x) is types.TupleType:
            return x
        else:
            raise ValueError("argument " + str(x) + " must be string or tuple")

    metadata_file = os.path.join(folder, "metadata.txt")
    doc_metadata = read_metadata(metadata_file)
    if recalc:
        translation = None
        scaling = None
    else:
        translation = parse_value(doc_metadata.get("big-thumbnail-translation-points"))
        scaling = parse_value(doc_metadata.get("big-thumbnail-scaling-factor"))
    if scaling is None or translation is None:
        cropbox_data = find_data("cropping-bounding-box")
        images_size = eval('(%s)' % find_data("images-size"))
        if cropbox_data:
            cropbox = [eval('(%s)' % x) for x in cropbox_data.split(';')]
        else:
            cropbox = [(0, 0), images_size]
        big_thumbnail_size = find_data("big-thumbnail-size")
        if big_thumbnail_size:
            big_tn_size = eval('(%s)' % big_thumbnail_size)
        else:
            from PIL import Image
            big_tn_size = Image.open(os.path.join(folder, "thumbnails", "big1.png")).size
        ppi = int(find_data("tiff-dpi") or find_data("images-dpi") or 300)
        # Remember that cropped page images have a 20 pixel border added back after scaling.
        left_crop_border = 0
        right_crop_border = 0
        top_crop_border = 0
        bottom_crop_border = 0
        if cropbox_data:
            if cropbox[0][0] != 0:
                left_crop_border = 20
            if cropbox[0][1] != 0:
                top_crop_border = 20
            if cropbox[1][0] != images_size[0]:
                right_crop_border = 20
            if cropbox[1][1] != images_size[1]:
                bottom_crop_border = 20
        # calculate a translation quantity in "points"
        translation = (0 - float((cropbox[0][0] - left_crop_border) * 72)/ppi,
                       0 - float((cropbox[0][1] - top_crop_border) * 72)/ppi)
        # calculate a scaling factor that goes from bounding box edges in "points" to
        # scaled thumbnail coordinates in "pixels"
        scaling = (float(ppi * big_tn_size[0])/float(72 * (cropbox[1][0] - cropbox[0][0] + (left_crop_border + right_crop_border))),
                   float(ppi * big_tn_size[1])/float(72 * (cropbox[1][1] - cropbox[0][1] + (top_crop_border + bottom_crop_border))))
    # now read the wordboxes and calculate the thumbnail bounding boxes for them
    note(4, " for %s: translation is %f, %f, scaling is %f, %f",
         folder, translation[0], translation[1], scaling[0], scaling[1])
    if update:
        update_metadata(metadata_file, {'big-thumbnail-scaling-factor': "%f,%f" % scaling,
                                        'big-thumbnail-translation-points': "%f,%f" % translation})
    return translation, scaling
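# A hedged sketch (not part of the original module; the numbers are made up)
# of how the returned pair is meant to be used: a coordinate in page space
# (in points) is first translated, then scaled, to land in big-thumbnail
# pixels.  The translation here corresponds to a crop origin of (36, 72)
# points, per the negation in the code above.
def _example_apply_translation_and_scaling():
    translation = (-36.0, -72.0)    # points: the negated crop origin
    scaling = (1.25, 1.25)          # pixels per point
    x_pts, y_pts = 100.0, 200.0     # e.g. a wordbox corner, in points
    x_px = (x_pts + translation[0]) * scaling[0]
    y_px = (y_pts + translation[1]) * scaling[1]
    return x_px, y_px               # (80.0, 160.0)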
def do_thumbnails (dirpath, output_dir, **params):
    note(2, " thumbnailing in %s...", dirpath)
    tmpdir = tempfile.mktemp()
    retval = params.get('returnvalue', false)
    doc_metadata_path = os.path.join(dirpath, "metadata.txt")
    try:
        os.mkdir(tmpdir)
        os.chmod(tmpdir, 0700)
        try:
            md = read_metadata(doc_metadata_path)
            is_temporary_doc = md.get("temporary-contents")
            if is_temporary_doc and (is_temporary_doc == "true"):
                # temporary -- don't spend much time on this
                create_temporary_icons(md, dirpath, output_dir, params)
                retval = true
                return
            if os.path.exists(os.path.join(dirpath, "document.tiff")):
                # contains one multi-page TIFF file
                tiffmaster = os.path.join(tmpdir, "master.tiff")
                split_command = (TIFF_SPLIT_CMD % (TIFFCP, os.path.join(dirpath, "document.tiff"),
                                                   tiffmaster,
                                                   TIFFSPLIT, tiffmaster, os.path.join(tmpdir, "x")))
                status, output, tsignal = subproc(split_command)
                if status != 0:
                    raise Error("'%s' signals non-zero exit status %d in %s => %s"
                                % (split_command, status, dirpath, tmpdir))
                parts_dir = tmpdir
                filecheck_fn = lambda fn: fn[0] == "x"
            elif (os.path.exists(os.path.join(dirpath, "page-images"))
                  and os.path.isdir(os.path.join(dirpath, "page-images"))):
                # contains directory full of PNG page images
                parts_dir = os.path.join(dirpath, "page-images")
                filecheck_fn = lambda fn: (fn.startswith('page') and fn.endswith('.png'))
            else:
                raise Error("No page images for document in %s" % dirpath)
            tiff_parts = os.listdir(parts_dir)
            if len(tiff_parts) < 1:
                raise Error("No pages in split tiff file directory after split!")
            # either a PNG-images or a TIFF split will sort properly in lexicographic order
            tiff_parts.sort()
            # see if there's a document icon legend and info about the DPI of the tiff file
            legend = md.get('document-icon-legend')
            tiff_dpi = int(md.get('images-dpi') or md.get('tiff-dpi') or params.get('images-dpi') or 0)
            page_numbers_v = md.get('page-numbers')
            page_numbers = (page_numbers_v and figure_page_numbers(page_numbers_v, dirpath))
            first_page = int(md.get('first-page-number', 1))
            skips = md.get('document-bbox-pages-to-skip', '')
            if skips:
                parts = string.split(skips, ':')
                bbox_skips = []
                for part in parts:
                    bbox_skips = bbox_skips + map(int, string.split(part, ','))
            else:
                bbox_skips = None
            # figure bounding box for imaged page
            page_count = 0
            bbox = None
            note(2, " calculating bounding box for large pages...")
            dont_crop = md.get('dont-crop-big-thumbnails', false)
            if AUTO_CROP_BIG_THUMBNAILS and not dont_crop:
                do_bbox = true
            else:
                do_bbox = false
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    continue
                if page_count == 0:
                    # find the width and height of the document
                    docwidth, docheight = figure_doc_size(os.path.join(parts_dir, tiff_part))
                    if not do_bbox:
                        bbox = (0, 0, docwidth, docheight)
                if do_bbox:
                    bbox = figure_bbox(os.path.join(parts_dir, tiff_part), page_count, bbox, bbox_skips)
                    if (bbox and bbox[0] == 0) and (bbox[1] == 0) and (bbox[2] >= docwidth) and (bbox[3] >= docheight):
                        # don't bother, there's no area to crop already
                        do_bbox = false
                page_count = page_count + 1
            if page_count == 0:
                raise Error("No pages in split tiff file directory after split!")
            note(2, " final bbox is %s, page_count is %d", bbox, page_count)
            if USE_VIRTUAL_INK:
                note(2, " alpha channels will be added to large thumbnails...")
            # now make the thumbnails
            big_thumbnail_size = []
            small_thumbnail_size = []
            icon_size = []
            page_index = 0
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    note(3, " skipping %s", tiff_part)
                    continue
                tiff_path = os.path.join(parts_dir, tiff_part)
                if page_numbers:
                    page_no_string = page_numbers.get(page_index)
                else:
                    page_no_string = None
                note(2, " page %d%s", page_index,
                     (page_no_string and " (%s)" % page_no_string) or "")
                try:
                    if not create_thumbnail(tiff_path, tiff_dpi, output_dir, page_index, first_page,
                                            page_count, bbox, bbox_skips,
                                            big_thumbnail_size, small_thumbnail_size, icon_size,
                                            params.get('maxwidth'), params.get('maxheight'),
                                            params.get('maxscaling'), params.get('thumbnail_strategy'),
                                            legend, page_no_string):
                        raise Error("Can't create thumbnail for page %d in %s (of %s)"
                                    % (page_index, tiff_path, dirpath))
                except Exception, x:
                    doc_id = os.path.split(dirpath)[1]
                    note("exception creating thumbnails for page %d of document %s:\n%s",
                         page_index, doc_id,
                         string.join(traceback.format_exception(*sys.exc_info()), ""))
                    raise AbortDocumentIncorporation(doc_id, str(x))
                if page_index == 0:
                    bt_width = big_thumbnail_size[0]
                    bt_height = big_thumbnail_size[1]
                    st_width = small_thumbnail_size[0]
                    st_height = small_thumbnail_size[1]
                else:
                    bt_width = max(bt_width, big_thumbnail_size[0])
                    bt_height = max(bt_height, big_thumbnail_size[1])
                    st_width = max(st_width, small_thumbnail_size[0])
                    st_height = max(st_height, small_thumbnail_size[1])
                st_scaling = (float(st_width)/float(docwidth) + float(st_height)/float(docheight)) / 2.0
                page_index = page_index + 1
            d = {"page-count": str(page_count),
                 "tiff-width": str(docwidth),
                 "images-width": str(docwidth),
                 "images-size": "%d,%d" % (docwidth, docheight),
                 "cropping-bounding-box": "%d,%d;%d,%d" % (bbox),
                 "big-thumbnail-size": "%s,%s" % (bt_width, bt_height),
                 "small-thumbnail-size": "%s,%s" % (st_width, st_height),
                 "small-thumbnail-scaling": "%f" % st_scaling,
                 "icon-size": "%d,%d" % icon_size[0],
                 "images-height": str(docheight),
                 "tiff-height": str(docheight),
                 }
            translation, scaling = thumbnail_translation_and_scaling(dirpath, d, false, true)
            d["big-thumbnail-translation-points"] = "%f,%f" % translation
            d["big-thumbnail-scaling-factor"] = "%f,%f" % scaling
            update_metadata(os.path.join(dirpath, "metadata.txt"), d)
        finally:
            shutil.rmtree(tmpdir)
        # indicate successful completion
        note(2, " finished.")
        retval = true
    finally:
        return retval
def rip(self, folder, docid):

    def encodestring(s):
        # WashPost strings have xml char refs, and we want Unicode
        if not s:
            return s
        s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
        # name2codepoint is a dict, so index it and convert the codepoint to a char
        s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
        return s

    def dequote(s):
        return re.sub(r"\\'", "'", s)

    def catclean(s):
        return re.sub(r"[/,]", "_", s)

    mdpath = os.path.join(folder, "metadata.txt")
    originalspath = os.path.join(folder, "originals", "original.html")
    if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
        return
    md = read_metadata(mdpath)
    url = md.get("original-url")
    if not url:
        return
    host, port, path = parse_URL(url)
    if host != "www.washingtonpost.com":
        return
    # OK, it's from the Post
    new_metadata = MetadataGatherer.parse(originalspath)
    for line in open(originalspath):
        if line.startswith(_HEADLINE):
            line = line[len(_HEADLINE):].strip("\n")
            t = _TITLEPATTERN.match(line)
            if t:
                new_metadata["hdl"] = dequote(t.group("title"))
            m = _AUTHORSPATTERN.search(line)
            if m:
                new_metadata["authors"] = dequote(line[len(m.group(0)):].strip(" ';\n"))
        if line.startswith(_CONTENTID):
            new_metadata["content-id"] = line[len(_CONTENTID):].strip(" ';\n")
        if line.startswith(_SECTION):
            section = line[len(_SECTION):].strip(" ';\n")
            i = section.index("'")
            new_metadata["section"] = section[:i]
    if "source" not in md:
        md["source"] = "Washington Post"
    # not all articles have metadata...
    if not ("hdl" in new_metadata):
        note(3, "No metadata in article: %s", new_metadata)
        return
    md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
    if "date" not in md:
        # get the date
        d = _URLDATEPATTERN.match(url)
        if d:
            md["date"] = "%s/%s/%s" % (d.group("month"), d.group("day"), d.group("year"))
    if "authors" not in md:
        # get the byline
        d = new_metadata.get("authors")
        if d:
            md["authors"] = encodestring(d)
    d = new_metadata.get("keywords")
    d0 = md.get("keywords")
    if d and d0:
        d0 = [x.strip() for x in d0.split(",")] + [x.strip() for x in d.split(";")]
    elif d:
        d0 = [x.strip() for x in d.split(";")]
    else:
        # no new keywords; leave the existing value alone
        d0 = None
    if d0:
        md["keywords"] = encodestring(",".join(d0))
    if new_metadata.get("description"):
        md["summary"] = encodestring(new_metadata.get("description"))
        md["abstract"] = encodestring(new_metadata.get("description"))
    section = new_metadata.get("section")
    if section:
        c = md.get("categories")
        if c:
            c = [x.strip() for x in c.split(",")]
        else:
            c = []
        c = c + ["article", "Washington Post/%s" % catclean(section)]
        md["categories"] = ",".join(c)
    content_id = new_metadata.get("content-id")
    if content_id:
        md["citation"] = "Washington Post article %s" % content_id
    update_metadata(mdpath, md)
def do_HTML (dirpath, html_dir, doc_id, port):
    note(3, " HTMLing in %s...", dirpath)
    html_index = os.path.join(dirpath, "index.html")
    doc_id = os.path.basename(dirpath)
    retval = false
    try:
        if not os.path.exists(html_dir):
            os.mkdir(html_dir)
        os.chmod(html_dir, 0700)
        metadata = read_metadata(os.path.join(dirpath, "metadata.txt"))
        title = metadata.get('name') or metadata.get('title') or doc_id
        pagewidth = None
        pageheight = None
        bts = metadata.get('big-thumbnail-size')
        if bts:
            pagewidth, pageheight = [int(x) for x in string.split(bts, ',')]
        note(3, " title is %s, pagesize is %sx%s", title, pagewidth, pageheight)
        # start with summary.html
        note(3, " summary.html")
        summarypath = os.path.join(dirpath, "summary.txt")
        if os.path.exists(summarypath):
            f = open(summarypath, 'r')
            summary_text = f.read()
            f.close()
            html_summary = htmlescape(summary_text, true)
        else:
            html_summary = ""
        html_summary_path = os.path.join(html_dir, "summary.html")
        f = open(html_summary_path, 'w')
        f.write('<html><body>' + html_summary + '</body></html>')
        f.close()
        os.chmod(html_summary_path, 0600)
        # next thumbs.html
        note(3, " thumbs.html")
        thumbs_path = os.path.join(html_dir, "thumbs.html")
        f = open(thumbs_path, "w")
        if USE_VIRTUAL_INK:
            bgcolor = "white"
        else:
            bgcolor = STANDARD_TOOLS_COLOR
        f.write('<html><body bgcolor="%s"><center>\n' % bgcolor)
        thumbnail_dir = os.path.join(dirpath, "thumbnails")
        thumbnail_files = os.listdir(thumbnail_dir)
        thumbs = []
        for thumbnail in thumbnail_files:
            m = re.match(r"(\d+)\.png", thumbnail)
            if m:
                thumbs.append((int(m.group(1)), thumbnail,))
        thumbs.sort()
        for thumbnail in thumbs:
            page_no = int(thumbnail[0])
            f.write('<a href="page%s.html" target=viewarea>' % page_no)
            f.write('<img src="../thumbnails/%s" border=1></a><br>\n' % thumbnail[1])
            # now write the HTML connected to that thumbnail
            page_html = os.path.join(html_dir, "page%s.html" % page_no)
            f2 = open(page_html, 'w')
            # get width of large page
            if not pagewidth or not pageheight:
                im = Image.open(os.path.join(thumbnail_dir, "big%s.png" % page_no))
                pagewidth, pageheight = im.size[0] - 25, im.size[1]
                note(3, " title is %s, pagesize is %sx%s", title, pagewidth, pageheight)
                del im
            f2.write('<html><body bgcolor="white"><img src="../thumbnails/big%s.png" usemap="#page%smap" border=0>\n'
                     % (page_no, page_no))
            f2.write('<map name="page%smap">\n' % page_no)
            if (page_no < len(thumbs)):
                f2.write('<area href="page%s.html" alt="to Page %s" shape="circle" coords="%s,60,10">\n'
                         % (page_no + 1, page_no + 1, pagewidth + 15))
                f2.write('<area href="page%s.html" alt="to Page %s" shape="rect" coords="%s,0,%s,%s">\n'
                         % (page_no + 1, page_no + 1, pagewidth/2, pagewidth, pageheight))
            if (page_no > 1):
                f2.write('<area href="page%s.html" alt="to Page %s" shape="circle" coords="%s,90,10">\n'
                         % (page_no - 1, page_no - 1, pagewidth + 15))
                f2.write('<area href="page%s.html" alt="to Page %s" shape="rect" coords="0,0,%s,%s">\n'
                         % (page_no - 1, page_no - 1, (pagewidth/2)-1, pageheight))
            f2.write('<area href="/" alt="to repository" target="_top" shape="circle" coords="%s,207,10">\n'
                     % (pagewidth + 15))
            f2.write('</map></body></html>\n')
            f2.close()
            os.chmod(page_html, 0600)
        f.write('</center></body></html>')
        f.close()
        os.chmod(thumbs_path, 0600)
        # next is controls.html
        note(3, " controls.html")
        controls_path = os.path.join(html_dir, "controls.html")
        f = open(controls_path, "w")
        if CONTROLS_TEMPLATE:
            f.write(CONTROLS_TEMPLATE % {'doc-id': doc_id})
        else:
            f.write('<html>\n<head>\n')
            f.write('<script type="text/javascript">\n')
            f.write('function newInWindow(did, title, w, h, sidebar, twopage) {\n')
            f.write(' var s = "/action/basic/dv_show?doc_id=" + did + "&no-margin=1";\n')
            f.write(' var c = "width=" + w + ",height=" + h;\n')
            f.write(' if (!sidebar)\n')
            f.write('   s = s + "&no-sidebar=1";\n')
            f.write(' if (twopage)\n')
            f.write('   s = s + "&two-pages=1";\n')
            f.write(' defaultStatus = s;\n')
            f.write(' window.open(s, title, config=c);\n')
            f.write('}\n')
            f.write('</script></head><body bgcolor="%s">\n<center>\n' % STANDARD_TOOLS_COLOR)
            f.write("""<a href="javascript:newInWindow('%s','%s', %d+30, %d+10, false, false); void 0;">Detach</a>"""
                    % (doc_id, htmlescape(title, true), pagewidth, pageheight))
            f.write(""" <a href="javascript:newInWindow('%s','%s', (2 * %d)+30, %d+10, false, true); void 0;">(2)</a>\n"""
                    % (doc_id, htmlescape(title, true), pagewidth, pageheight))
            buttons = get_buttons_sorted(FN_DOCUMENT_SCOPE)
            for button in buttons:
                url = button[1][4]
                target = button[1][3]
                label = button[1][0]
                if url:
                    f.write('<br>\n<a href="%s"' % htmlescape(url % doc_id, true))
                else:
                    f.write('<br>\n<a href="/action/basic/repo_userbutton?uplib_userbutton_key=%s&doc_id=%s"'
                            % (button[0], doc_id))
                if target:
                    f.write(' target="%s"' % target)
                f.write('>%s</a>\n' % label)
            f.write("</center></body></html>")
        f.close()
        os.chmod(controls_path, 0600)
        # then index.html
        note(3, " index.html")
        f = open(html_index, "w")
        f.write('<head>\n')
        f.write('<title>%s</title>\n</head>\n' % htmlescape(title))
        f.write('<base target="_top">'
                '<frameset cols="%s,*">'
                '<frameset rows="%s,*">'
                '<frame name=controls src="./html/controls.html">'
                '<frame name=thumbs src="./html/thumbs.html">'
                '</frameset>'
                '<frame name="viewarea" src="./html/page1.html">'
                '</frameset>\n' % (THUMBNAIL_COLWIDTH, CONTROLS_HEIGHT))
        f.close()
        os.chmod(html_index, 0600)
        # indicate successful completion
        note(3, " finished.")
        retval = true
    except:
        info = sys.exc_info()
        note(0, "exception raised in createHTML:\n%s\n", string.join(traceback.format_exception(*info)))
        raise
    else:
        if not retval:
            note("bad retval %s", retval)
            if os.path.exists(html_index):
                os.unlink(html_index)
            if os.path.exists(html_dir):
                shutil.rmtree(html_dir)
def build_index_1_1 (repo):
    overhead_dir = repo.overhead_folder()
    index_file = os.path.join(overhead_dir, "index.upri")
    repo_mtime = repo.mod_time()
    note(3, "Considering rebuild of repository metadata index file...")
    if os.path.exists(index_file):
        # see if it's newer than the metadata.txt file
        mtime = os.path.getmtime(index_file)
        note(3, "repo mod time is %s, index file mod time is %s", repo_mtime, mtime)
        if mtime >= repo_mtime:
            note(3, "Index up-to-date.")
            return
    note("Re-building repository metadata index...")
    # need to rebuild index
    # some variables to keep track of categories and collections
    categories = {}
    collections = {}
    documents = {}
    authors = {}
    # read the repository metadata
    mdata = read_metadata(os.path.join(overhead_dir, "metadata.txt"))
    repo_password_hash = mdata.get('password')
    repo_password_hash = (repo_password_hash and binascii.a2b_hex(repo_password_hash)) or (20 * '\0')

    def figure_author_name (basename):
        def clean_token(t):
            v = t.strip()
            if v[-1] == ",":
                v = v[:-1]
            return v
        honorifics = ("MD", "M.D.", "PhD", "Ph.D.", "Jr.", "Sr.", "II", "III", "IV", "V", "MPA")
        tokens = [clean_token(x) for x in basename.strip().split(' ') if x.strip()]
        if not tokens:
            note("Author name \"%s\" => %s", basename, tokens)
            return ""
        v = tokens[-1]
        h = ""
        while v in honorifics:
            h = h + ((h and " ") or "") + v
            tokens = tokens[:-1]
            v = tokens[-1]
        if len(tokens) > 2 and (tokens[-2] in ("van", "de", "von")):
            v = tokens[-2] + " " + v
            tokens = tokens[:-1]
        if tokens[:-1]:
            v = v + ", " + string.join(tokens[:-1])
        if h:
            v = v + ", " + h
        return v

    def read_document (doc, categories, collections, authors):
        def figure_date(datestring):
            d2 = parse_date(datestring)
            if (not d2) or (sum(d2) == 0):
                return 0
            return d2[0] * (13 * 32) + d2[1] * 13 + d2[2]
        docdata = {'id': doc.id, 'rloc': 0}
        mdata = doc.get_metadata()
        docdata['title'] = mdata.get('title', "")
        docdata['page-count'] = int(mdata.get('page-count', 1))
        date = mdata.get('date')
        if date:
            docdata['date'] = figure_date(date)
        else:
            docdata['date'] = 0
        docdata['addtime'] = int(id_to_time(doc.id))
        # we don't really know the reftime (FIXME) but we'll use the document add time as an approximation
        docdata['reftime'] = docdata['addtime']
        docdata['categories'] = []
        cstring = mdata.get('categories', "")
        if cstring:
            for category in split_categories_string(cstring):
                if not category in categories:
                    categories[category] = {'rloc': 0, 'docs': [doc.id,], 'name': category}
                else:
                    categories[category]['docs'].append(doc.id)
                docdata['categories'].append(category)
        docdata['authors'] = []
        auths = mdata.get('authors', "").split(" and ")
        for auth in auths:
            if auth:
                authname = figure_author_name(auth)
                if not authname in authors:
                    authors[authname] = {'rloc': 0, 'docs': [doc.id,], 'name': authname}
                else:
                    authors[authname]['docs'].append(doc.id)
                docdata['authors'].append(authname)
        return docdata

    for doc in repo.generate_docs():
        documents[doc.id] = read_document(doc, categories, collections, authors)
    note(3, " processed documents...")
    # read the collections files
    for collname, coll in repo.list_collections():
        collections[coll.id] = {
            'name': collname,
            'docs': [doc.id for doc in coll.docs()],
            'query': (isinstance(coll, QueryCollection) and coll.query) or "",
            'rloc': 0,
            'presto': isinstance(coll, PrestoCollection),
            'excludes': (isinstance(coll, PrestoCollection) and coll.excludes) or [],
            'includes': (isinstance(coll, PrestoCollection) and coll.includes) or [],
            'id': coll.id,
            }
    note(3, " processed collections...")

    # now figure out the layout of the index file

    def sorted_values(d, rname):
        def compare(r1, r2):
            v1 = r1.get(rname)
            v2 = r2.get(rname)
            if (type(v1) in types.StringTypes) and (type(v2) in types.StringTypes):
                return cmp(v1.lower(), v2.lower())
            else:
                return cmp(v1, v2)
        l = d.values()
        l.sort(compare)
        return l

    def document_record_size(r):
        return (2 +        # offset to next document record
                2 +        # page count
                2 +        # number of categories
                2 +        # number of authors
                4 +        # date published
                4 +        # date last used
                4 +        # date added to repository
                4 * len(r.get('authors')) +
                4 * len(r.get('categories')) +
                2 + (len(r.get("id").encode("UTF-8")) + 1) +      # document ID
                2 + (len(r.get("title").encode("UTF-8")) + 1))    # title

    def category_record_size(r):
        return (2 +        # offset to next record
                2 +        # number of documents
                4 * len(r.get("docs")) +                          # positions of document records
                2 + (len(r.get("name").encode("UTF-8")) + 1))     # category name

    def author_record_size(r):
        return (2 +        # offset to next record
                2 +        # number of documents
                4 * len(r.get("docs")) +                          # positions of document records
                2 + (len(r.get("name").encode("UTF-8")) + 1))     # author name

    def collection_record_size(r):
        return (2 +        # offset to next record
                2 +        # number of documents
                4 * len(r.get("docs")) +                          # positions of document records
                2 +        # number of explicitly included documents
                2 +        # number of explicitly excluded documents
                4 * len(r.get("includes")) +                      # explicitly included
                4 * len(r.get("excludes")) +                      # explicitly excluded
                2 + (len(r.get("name").encode("UTF-8")) + 1) +    # collection name
                2 + (len(r.get("query").encode("UTF-8")) + 1))    # collection query

    def repository_record_size(r):
        return (4 +        # number of docs in repository
                4 +        # number of authors
                4 +        # last-modified time
                2 +        # number of categories
                2 +        # number of collections
                4 +        # first document record
                4 +        # first category record
                4 +        # first collection record
                4 +        # first authors record
                20 +       # SHA hash of password
                2 + len(r.get("name", "").encode("UTF-8")) + 1)

    def round8 (v):
        return ((v + 7)/8)*8

    mdata['rsize'] = repository_record_size(mdata)
    mdata['rloc'] = 32
    first_doc_record = round8(mdata['rloc'] + mdata['rsize'])
    loc = first_doc_record
    for document in sorted_values(documents, 'id'):
        document['rsize'] = round8(document_record_size(document))
        document['rloc'] = loc
        loc += document['rsize']
        document['nextoffset'] = document['rsize']
    first_categories_record = loc
    for category in sorted_values(categories, 'name'):
        category['rsize'] = round8(category_record_size(category))
        category['rloc'] = loc
        loc += category['rsize']
        category['nextoffset'] = category['rsize']
    first_collections_record = loc
    for collection in sorted_values(collections, 'name'):
        collection['rsize'] = round8(collection_record_size(collection))
        collection['rloc'] = loc
        loc += collection['rsize']
        collection['nextoffset'] = collection['rsize']
    first_author_record = loc
    for author in sorted_values(authors, 'name'):
        author['rsize'] = round8(author_record_size(author))
        author['rloc'] = loc
        loc += author['rsize']
        author['nextoffset'] = author['rsize']
    note(3, " figured layout...")
    # output data for debugging
    note(4, "repository name: %s", mdata.get("name"))
    note(3, "Documents (%d) at %s:", len(documents), first_doc_record)
    for document in documents.values():
        note(4, " %s\n %s // %s pages // date %s // %s // %s",
             document['title'], document['authors'], document['page-count'],
             document['date'], document['id'], document['rloc'])
    note(3, "Categories (%d) at %s:", len(categories), first_categories_record)
    for category in categories.values():
        note(4, " %s // %d docs // %s", category['name'], len(category['docs']), category['rloc'])
    note(3, "Collections (%d) at %s:", len(collections), first_collections_record)
    for collection in collections.values():
        note(4, " %s // %s // %d docs // %s",
             collection['name'], collection['query'], len(collection['docs']), collection['rloc'])
    for author in authors.values():
        note(4, " %s // %d docs // %s", author['name'], len(author['docs']), author['rloc'])
        for doc in author['docs']:
            r = documents.get(doc)
            note(4, " %s \"%s\"", r['id'], r['title'])
    note(3, "total size is %s", loc)

    # output the index file

    def out4(fp, v):
        fp.write(struct.pack(">I", v & 0xFFFFFFFF))

    def out2(fp, v):
        fp.write(struct.pack(">H", v & 0xFFFF))

    def outs(fp, v):
        s = (v and v.encode('UTF-8')) or ""
        fp.write(struct.pack(">H", (len(s) + 1) & 0xFFFF) + s + '\0')

    fp = open(index_file, "wb")
    try:
        # index version header
        magic = u"UpLib Repository Index 1.1".encode('US-ASCII')
        fp.write(magic + ('\0' * (32 - len(magic))))
        # write out repository information
        out4(fp, len(documents))
        out4(fp, len(authors))
        out4(fp, int(repo_mtime))    # seconds since 1/1/1970
        out2(fp, len(categories))
        out2(fp, len(collections))
        out4(fp, first_doc_record)
        out4(fp, first_categories_record)
        out4(fp, first_collections_record)
        out4(fp, first_author_record)
        fp.write(repo_password_hash)
        outs(fp, mdata.get("name", ""))
        # for each document record, write that
        for document in sorted_values(documents, 'rloc'):
            note(4, "document %s at %s [%s]", document['id'], document['rloc'], document['rsize'])
            fp.seek(document['rloc'])
            out2(fp, document['nextoffset'])
            out2(fp, document['page-count'])
            out2(fp, len(document['categories']))
            out2(fp, len(document['authors']))
            out4(fp, document['date'])
            out4(fp, document['reftime'])
            out4(fp, document['addtime'])
            for a in document['authors']:
                r = authors.get(a)
                out4(fp, (r and r.get('rloc')) or 0)
            for c in document['categories']:
                r = categories.get(c)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, document['id'])
            outs(fp, document['title'])
        fp.flush()
        # write out categories
        for category in sorted_values(categories, 'rloc'):
            note(4, "category %s at %s [%s]", category['name'], category['rloc'], category['rsize'])
            fp.seek(category['rloc'])
            out2(fp, category['nextoffset'])
            out2(fp, len(category['docs']))
            for docid in category['docs']:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, category['name'])
        fp.flush()
        # write out collections
        for collection in sorted_values(collections, 'rloc'):
            note(4, "collection %s at %s [%s] includes=%s excludes=%s",
                 collection['name'], collection['rloc'], collection['rsize'],
                 ((not collection['presto']) and 0xFFFF) or len(collection['includes']),
                 ((not collection['presto']) and 0xFFFF) or len(collection['excludes']))
            fp.seek(collection['rloc'])
            out2(fp, collection['nextoffset'])
            out2(fp, len(collection['docs']))
            for docid in collection['docs']:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            includes = collection['includes']
            excludes = collection['excludes']
            out2(fp, ((not collection['presto']) and 0xFFFF) or len(includes))
            out2(fp, ((not collection['presto']) and 0xFFFF) or len(excludes))
            for docid in includes:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            for docid in excludes:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, collection['name'])
            outs(fp, collection['query'])
        fp.flush()
        # write out authors
        for author in sorted_values(authors, 'rloc'):
            note(4, "author %s at %s [%s]", author['name'], author['rloc'], author['rsize'])
            fp.seek(author['rloc'])
            out2(fp, author['nextoffset'])
            out2(fp, len(author['docs']))
            for docid in author['docs']:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, author['name'])
        fp.flush()
        # finished
        fp.close()
        note(3, "wrote index at %s", os.path.getmtime(index_file))
    except:
        excinfo = sys.exc_info()
        fp.close()
        os.unlink(index_file)
        note(0, "exception %s", ''.join(traceback.format_exception(*excinfo)))
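# A hedged reader sketch (not part of the original module) for the fixed
# header that build_index_1_1 writes: it simply mirrors the out2/out4/outs
# calls above with struct.unpack, and returns the counts, record offsets,
# password hash, and repository name.
def _example_read_index_header(path):
    fp = open(path, "rb")
    try:
        magic = fp.read(32).rstrip('\0')        # "UpLib Repository Index 1.1"
        ndocs, nauthors, mtime = struct.unpack(">III", fp.read(12))
        ncategories, ncollections = struct.unpack(">HH", fp.read(4))
        (first_doc, first_cat,
         first_coll, first_auth) = struct.unpack(">IIII", fp.read(16))
        password_hash = fp.read(20)             # SHA hash of the password
        (namelen,) = struct.unpack(">H", fp.read(2))
        name = fp.read(namelen)[:-1].decode("UTF-8")   # counted string, NUL-terminated
        return (magic, ndocs, nauthors, mtime, ncategories, ncollections,
                first_doc, first_cat, first_coll, first_auth, password_hash, name)
    finally:
        fp.close()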
def get_folder_metadata (self, location):
    return read_metadata(self.folder_metadata_path(location))
def _add_internal (ostream, percent_done_fn, repo, response, params, content, wait):
    # this can be called in several different ways.
    # In general, you post a multipart/form-data body which
    # contains a "contenttype" for the document, and either a "URL"
    # for the content, or a "content" parameter containing the
    # actual content.  If both "URL" and "content" are present,
    # the URL is added as the "original-url" value for the metadata,
    # and if the content is HTML, it's used as the "original.html"
    # and the URL is used to pull ancillary content referenced in it.
    content_type = params.get("contenttype")
    url = params.get("URL")
    noredir = params.get("no-redirect")
    noredir = noredir and (noredir.lower() == "true")
    uploadloc = url
    docname = params.get("documentname")
    tempf = None
    suppress_duplicates = params.get("suppress-duplicates")
    suppress_duplicates = suppress_duplicates and (suppress_duplicates.lower() == "true")
    bury = params.get("bury")
    bury = bury and (bury.lower() == "true")
    verbosity = int(params.get("verbosity") or "0")
    if content:
        if wait and ostream:
            _rewrite_job_output(ostream, '{ state: 0, msg: "Caching page..."}')
        extension = CONTENT_TYPES.get(content_type)
        if not extension:
            if wait:
                msg = "Don't know what to do with contenttype \"%s\"" % content_type
                if ostream:
                    _rewrite_job_output(ostream, '{state: 1, msg: "' + urllib.quote(msg) + '"}')
                else:
                    response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE, msg)
            return
        # special case HTML/XHTML
        if content_type.lower() in ("text/html", "application/xhtml+xml"):
            tempf = tempfile.mkdtemp()
            uploadloc = os.path.join(tempf, "original.html")
            # make sure that the folder for other parts exists, even if empty
            os.mkdir(os.path.join(tempf, "original_files"))
            # remove our bookmarklet, if present
            content = _BOOKMARKLET_PATTERN.sub('', content)
            content = _ADD_FORM_PATTERN.sub('', content)
            c = _OurCacher(url, filename=uploadloc, bits=content, content_type=content_type)
            # make sure that the folder for other parts exists, even if empty
            other_parts = os.path.join(tempf, "original_files")
            if not os.path.exists(other_parts):
                os.mkdir(other_parts)
        # special case 3x5 cards
        elif (docname and (content_type.lower() == "text/plain")
              and os.path.splitext(docname)[1] == ".3x5"):
            fd, tempf = tempfile.mkstemp(".3x5")
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        else:
            fd, tempf = tempfile.mkstemp("." + extension)
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        if suppress_duplicates:
            hash = calculate_originals_fingerprint(tempf)
            results = repo.do_query("sha-hash:" + hash)
            if results:
                # it's a duplicate
                doc = results[0][1]
                if os.path.isdir(tempf):
                    shutil.rmtree(tempf)
                elif os.path.exists(tempf):
                    os.remove(tempf)
                if ostream:
                    _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc.id + '"}')
                elif noredir:
                    response.reply(doc.id, "text/plain")
                else:
                    response.redirect("/action/basic/dv_show?doc_id=%s" % doc.id)
                return
    try:
        try:
            # get a cookie for authentication
            cookie = repo.new_cookie(url or content[:min(100, len(content))])
            cookie_str = '%s=%s; path=/; Secure' % (cookie.name(), cookie.value())
            os.environ["UPLIB_COOKIE"] = cookie_str
            doctitle = params.get("md-title")
            docauthors = params.get("md-authors")
            docdate = params.get("md-date")
            doccats = params.get("md-categories")
            metadata = params.get("metadata")
            if metadata:
                mdtmpfile = tempfile.mktemp()
                open(mdtmpfile, "w").write(metadata)
                # check to see if we're replacing an existing document
                md2 = read_metadata(StringIO.StringIO(metadata))
                existing_doc_id = md2.get("replacement-contents-for")
                if existing_doc_id and not repo.valid_doc_id(existing_doc_id):
                    raise ValueError("Invalid doc ID %s specified for replacement" % existing_doc_id)
            else:
                mdtmpfile = None
                existing_doc_id = None
            # now form the command
            scheme = ((repo.get_param("use-http", "false").lower() == "true") or _use_http) and "http" or "https"
            cmd = '%s --verbosity=%s --repository=%s://127.0.0.1:%s ' % (
                _uplib_add_document, verbosity, scheme, repo.port())
            if doctitle:
                cmd += ' --title=%s' % pipes.quote(doctitle)
            if docauthors:
                cmd += ' --authors=%s' % pipes.quote(docauthors)
            if docdate:
                cmd += ' --date="%s"' % docdate
            if doccats:
                cmd += ' --categories=%s' % pipes.quote(doccats)
            if mdtmpfile:
                cmd += ' --metadata="%s"' % mdtmpfile
            cmd += ' "%s"' % uploadloc
            if ostream:
                _rewrite_job_output(ostream, '{state: 0, msg: "' + urllib.quote(cmd) + '"}')
            # and invoke the command
            status, output, tsignal = subproc(cmd)
            note(4, "cmd is %s, status is %s, output is %s", repr(cmd), status, repr(output.strip()))
            if mdtmpfile:
                os.unlink(mdtmpfile)
            if status == 0:
                # success; output should be doc-id
                doc_id = existing_doc_id or output.strip().split()[-1]
                note(4, "output is '%s'; doc_id for new doc is %s", output.strip(), doc_id)
                if wait and ostream:
                    _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id +
                                        '", msg: "' + urllib.quote(output) + '"}')
                # wait for it to come on-line
                if percent_done_fn:
                    percent_done_fn(40)    # estimate 40% of work done on client side
                while not repo.valid_doc_id(doc_id):
                    if ostream:
                        pending = repo.list_pending(full=True)
                        s = _first(pending, lambda x: x['id'] == doc_id)
                        if not s:
                            break
                        dstatus = s['status']
                        if dstatus == 'error':
                            msg = 'server-side error incorporating document'
                            _rewrite_job_output(ostream, '{ state: 3, doc_id: "' + doc_id +
                                                '", msg: "' + urllib.quote(s['error']) + '"}')
                            break
                        if dstatus == 'unpacking':
                            msg = 'starting ripper process...'
                        elif dstatus == 'ripping':
                            msg = "ripping with ripper '" + s['ripper'] + "'..."
                        elif dstatus == 'moving':
                            msg = 'adding to registered document set...'
                        _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id +
                                            '", msg: "' + urllib.quote(msg) + '"}')
                    time.sleep(1.0)
                if percent_done_fn:
                    percent_done_fn(100)   # finished
                if repo.valid_doc_id(doc_id):
                    if bury:
                        # wait up to 100 seconds for it to show up in history list
                        # after that, wait another second, then bury it
                        counter = 100
                        while counter > 0:
                            h = [x.id for x in repo.history()]
                            if doc_id in h:
                                break
                            counter -= 1
                            time.sleep(1)
                        time.sleep(1)
                        repo.touch_doc(doc_id, bury=True, notify=False)
                        note(3, "buried %s", doc_id)
                    if wait:
                        if ostream:
                            _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc_id + '"}')
                        elif noredir:
                            response.reply(doc_id, "text/plain")
                        else:
                            response.redirect("/action/basic/dv_show?doc_id=%s" % doc_id)
            else:
                note("cmd <<%s>> failed with status %s:\n%s", cmd, status, output)
                if wait:
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 3, msg: "' +
                                            urllib.quote('Error processing the document:\n' + output) + '"}')
                    else:
                        response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                       "<pre>" + htmlescape(output) + "</pre>")
        except:
            e = ''.join(traceback.format_exception(*sys.exc_info()))
            if wait:
                note(3, "Exception processing uplib-add-document request:\n%s", htmlescape(e))
                if ostream:
                    _rewrite_job_output(ostream, '{state: 3, msg: "' +
                                        urllib.quote("Exception processing uplib-add-document request:\n" + e) + '"}')
                else:
                    response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                   "Exception processing uplib-add-document request:\n<pre>" +
                                   htmlescape(e) + "\n</pre>")
            else:
                note("Exception processing uplib-add-document request:\n%s", e)
    finally:
        if tempf and os.path.isfile(tempf):
            os.unlink(tempf)
        elif tempf and os.path.isdir(tempf):
            shutil.rmtree(tempf)
def process_folder (repo, id, directory, delete_p, replace=None):

    def _protect_files (mode, dirname, files):
        for file in files:
            thepath = os.path.join(dirname, file)
            if os.path.isdir(thepath):
                os.chmod(thepath, 0700)
            else:
                os.chmod(thepath, 0600)

    note(2, "processing folder %s...", directory)
    description = None
    contents = None
    summary = None
    metadata = None
    wordbboxes = os.path.join(directory, "wordbboxes")
    tifffile = os.path.join(directory, "document.tiff")
    pageimagesdir = os.path.join(directory, "page-images")
    images = os.path.join(directory, "images")
    originals = os.path.join(directory, "originals")
    links = os.path.join(directory, "links")
    names = os.listdir(directory)
    for name in names:
        if string.lower(name) == "contents.txt":
            contents = os.path.join(directory, name)
        elif string.lower(name) == "summary.txt":
            summary = os.path.join(directory, name)
        elif string.lower(name) == "metadata.txt":
            metadata = os.path.join(directory, name)
    if replace is None:
        newdir = os.path.join(repo.pending_folder(), id)
    else:
        newdir = replace
        if not os.path.isdir(newdir):
            raise Error("Pending directory %s does not exist!" % newdir)
    try:
        lock_folder(newdir)
        try:
            if os.path.exists(images):
                destpath = os.path.join(newdir, "images")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(images, destpath)
                if delete_p:
                    shutil.rmtree(images, true)
            if os.path.exists(originals):
                destpath = os.path.join(newdir, "originals")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(originals, destpath)
                if delete_p:
                    shutil.rmtree(originals, true)
            if os.path.exists(links):
                destpath = os.path.join(newdir, "links")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(links, destpath)
                if delete_p:
                    shutil.rmtree(links, true)
            if metadata:
                destpath = os.path.join(newdir, "metadata.txt")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                shutil.copyfile(metadata, destpath)
                m = read_metadata(metadata)
                if m.has_key("title"):
                    note("Title of uploaded folder is '%s'", m['title'])
                if delete_p:
                    os.unlink(metadata)
            else:
                # create an empty metadata.txt
                destpath = os.path.join(newdir, "metadata.txt")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                mdf = open(destpath, 'w')
                mdf.flush()
                mdf.close()
            newcontents = os.path.join(newdir, "contents.txt")
            if contents:
                if replace and os.path.exists(newcontents):
                    os.unlink(newcontents)
                shutil.copyfile(contents, newcontents)
                if delete_p:
                    os.unlink(contents)
            newsummary = os.path.join(newdir, "summary.txt")
            if summary:
                if replace and os.path.exists(newsummary):
                    os.unlink(newsummary)
                shutil.copyfile(summary, newsummary)
                if delete_p:
                    os.unlink(summary)
            if os.path.exists(wordbboxes):
                destpath = os.path.join(newdir, "wordbboxes")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                shutil.copyfile(wordbboxes, destpath)
                if delete_p:
                    os.unlink(wordbboxes)
            if os.path.exists(tifffile):
                destpath = os.path.join(newdir, "document.tiff")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                shutil.copyfile(tifffile, destpath)
                if delete_p:
                    os.unlink(tifffile)
            elif os.path.isdir(pageimagesdir):
                destpath = os.path.join(newdir, "page-images")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(pageimagesdir, destpath)
                if delete_p:
                    shutil.rmtree(pageimagesdir, true)
            os.path.walk(newdir, _protect_files, None)
            os.chmod(newdir, 0700)
            return id
        finally:
            unlock_folder(newdir)
    except:
        type, value, tb = sys.exc_info()
        if os.path.exists(newdir) and not replace:
            shutil.rmtree(newdir)
        # re-raise the exception
        raise value, None, tb
def flesh_out_folder(id, tmpfilename, metadata, repo, unpack_fn, counter):
    try:
        try:
            # note(3, "CODETIMER_ON is %s", CODETIMER_ON)
            # if CODETIMER_ON:
            #     code_timer.Init()
            #     code_timer.CreateTable("uplib")
            #     code_timer.CodeTimerOn()
            #     code_timer.StartInt("newFolder$unpack", "uplib")
            # else:
            #     code_timer.CodeTimerOff()
            if unpack_fn and tmpfilename and os.path.exists(tmpfilename):
                unpack_fn(repo, id, tmpfilename, metadata)
            # if CODETIMER_ON:
            #     code_timer.StopInt("newFolder$unpack", "uplib")
            folderpath = repo.pending_location(id)
            try:
                note("unpacked new folder in %s", folderpath)
                if not sys.platform.lower().startswith("win"):
                    s, o, t = subproc("ls -Rl %s" % folderpath)
                    note("%s\n" % o)
                fp = open(os.path.join(folderpath, "UNPACKED"), 'w')
                fp.flush()
                fp.close()
                # as of this point, we can restart the inclusion of the document
                md = read_metadata(os.path.join(folderpath, "metadata.txt"))
                replacement_id = md.get("replacement-contents-for")
                if replacement_id:
                    if repo.valid_doc_id(replacement_id):
                        # contents to replace another document
                        md["replacement-contents-for"] = ""
                        update_metadata(os.path.join(folderpath, "metadata.txt"), md)
                        note(2, "replacing contents of %s with this data...", replacement_id)
                        existing_document = repo.get_document(replacement_id)
                        new_folder = existing_document.folder()
                        process_folder(repo, replacement_id, folderpath, false, new_folder)
                        _run_rippers(new_folder, repo, replacement_id)
                        existing_document.recache()
                        repo.touch_doc(existing_document)
                        raise AbortDocumentIncorporation(id, "replacement for existing document %s" % replacement_id)
                    else:
                        raise AbortDocumentIncorporation(id, "replacement for non-existent document %s" % replacement_id)
                _finish_inclusion(repo, folderpath, id)
                # if CODETIMER_ON:
                #     noteOut = StringIO.StringIO()
                #     noteOut.write("\nCode Timer statistics (what took time, in milliseconds):\n")
                #     code_timer.PrintTable(noteOut, "uplib")
                #     noteOut.write("\n")
                #     noteOutString = noteOut.getvalue()
                #     note(3, noteOutString)
            except:
                type, value, tb = sys.exc_info()
                note("%s", ''.join(traceback.format_exception(type, value, tb)))
                note_error(folderpath, (type, value, tb))
                raise value, None, tb
        except AbortDocumentIncorporation, x:
            # ripper signalled to stop adopting this document, for good
            note(2, "AbortDocumentIncorporation exception on %s: %s", x.id, x.message)
            if (x.id == id):
                shutil.rmtree(folderpath)
                remove_from_index(repo.index_path(), id)
    except:
        type, value, tb = sys.exc_info()
        note("Exception processing new folder:\n%s", ''.join(traceback.format_exception(type, value, tb)))
def findimages(folder, debug=None):
    images = []
    if not FINDIMAGES_PROGRAM:
        note(3, "FINDIMAGES_PROGRAM not defined")
        return images
    images_dir = os.path.join(folder, "page-images")
    if not os.path.isdir(images_dir):
        note(3, "No page images in %s", images_dir)
        return images
    md = read_metadata(os.path.join(folder, "metadata.txt"))
    dpi = int(md.get("images-dpi") or md.get("dpi") or md.get("tiff-dpi") or 300)
    scaling_factor = float(dpi)/72

    def get_images_for_page (page_index, wordboxes, dpi, images_dir):
        pageimages = []
        filepath = os.path.join(images_dir, "page%05d.png" % (page_index + 1))
        if os.path.exists(filepath):
            wordboxes_file = tempfile.mktemp()
            try:
                boxlist = []
                if wordboxes:
                    # first, write out list of wordboxes, in Leptonica BOXA format
                    for i in range(len(wordboxes)):
                        box = wordboxes[i]
                        x, y, w, h = (int(box.left() * dpi / 72.0),
                                      int(box.top() * dpi / 72.0),
                                      int(box.width() * dpi / 72.0),
                                      int(box.height() * dpi / 72.0))
                        if (w > 0) and (h > 0):
                            boxlist.append((x, y, w, h))
                    if len(boxlist) > 0:
                        fp = open(wordboxes_file, "wb")
                        fp.write("\nBoxa Version 2\nNumber of boxes = %d\n" % len(boxlist))
                        for i in range(len(boxlist)):
                            fp.write("  Box[%d]: " % i + "x = %d, y = %d, w = %d, h = %d\n" % boxlist[i])
                        fp.close()
                # now, run the finder on the page image plus the list of wordboxes
                debug_arg = (debug and "--debug") or " "
                cmd = "%s %s %s %s %s" % (FINDIMAGES_PROGRAM, debug_arg, dpi, filepath,
                                          (boxlist and wordboxes_file) or "-")
                note(4, "findimages cmd is <<%s>>", cmd)
                status, output, tsignal = subproc(cmd)
                if status == 0:
                    for line in [x.strip() for x in output.split('\n') if x.strip()]:
                        if not line.startswith("halftone "):
                            continue
                        pageimages.append((str(page_index) + " " + line.strip()).split())
                else:
                    note(3, "findimages command <%s> returns bad status %s:\n%s\n" % (cmd, status, output))
            finally:
                # remove the temp file
                if os.path.exists(wordboxes_file):
                    os.unlink(wordboxes_file)
                # note("%d: wordboxes file is %s", page_index, wordboxes_file)
        return pageimages

    if os.path.exists(os.path.join(folder, "wordbboxes")):
        for page_index, boxes in wordboxes_page_iterator(folder):
            images += get_images_for_page(page_index, boxes, dpi, images_dir)
    else:
        # handle case where there's no text for the image
        files = os.listdir(images_dir)
        for file in files:
            m = PAGE_IMAGE_FILENAME_PATTERN.match(file)
            if m:
                pageimages = get_images_for_page(int(m.group(1)) - 1, None, dpi, images_dir)
                images += pageimages
    point_squared = scaling_factor * scaling_factor
    images = [(pageno, imtype, x, y, width, height)
              for (pageno, imtype, x, y, width, height) in images
              if ((int(height) * int(width)) > point_squared)]
    note(3, "images for %s are %s", folder, images)
    return images
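# For reference, a hedged sample (hypothetical coordinates, not part of the
# original module) of the Leptonica BOXA file that get_images_for_page writes
# for the external finder; each box is a wordbox scaled from points to image
# pixels at the page dpi.
_EXAMPLE_BOXA_FILE = """
Boxa Version 2
Number of boxes = 2
  Box[0]: x = 150, y = 300, w = 420, h = 55
  Box[1]: x = 150, y = 370, w = 610, h = 55
"""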