def create_temporary_icons (metadata, dirpath, output_dir, params):
    global TEMPORARY_BACKGROUND, UNDER_CONSTRUCTION
    thumbnails_path = output_dir
    os.mkdir(thumbnails_path)
    note("thumbnails_path is %s", thumbnails_path)
    title = metadata.get("title")
    document_icon = Image.new("RGB", (150, 194), TEMPORARY_BACKGROUND)
    draw = ImageDraw.Draw(document_icon)
    draw.line((0, 0) + document_icon.size, LEGEND_COLOR)
    draw.line((0, document_icon.size[1], document_icon.size[0], 0), LEGEND_COLOR)
    draw.rectangle((0, 0, document_icon.size[0] - 1, document_icon.size[1] - 1), outline=LEGEND_COLOR)
    if title:
        # no spaces in the color prefix, matching the legend entries below
        document_icon = add_legend(document_icon, ("(255,255,255)" + title,))
    document_icon.save(os.path.join(thumbnails_path, "first.png"), "PNG")
    page_1_big = Image.new("RGB", (425, 550), TEMPORARY_BACKGROUND)
    legend = []
    legend.append("(255,255,255)[temporary document]")
    if title:
        legend.append("(0,255,0)%s" % title)
    page_1_big = add_legend(page_1_big, legend)
    page_1_big.save(os.path.join(thumbnails_path, "big1.png"), "PNG")
    page_1_small = Image.new("RGB", (85, 110), TEMPORARY_BACKGROUND)
    add_page_no(page_1_small, (5, 5), "1")
    page_1_small.save(os.path.join(thumbnails_path, "1.png"), "PNG")
    # placeholder metadata for a standard 300 dpi US-letter page (2550x3300 pixels)
    update_metadata(os.path.join(dirpath, "metadata.txt"),
                    {"page-count": "1",
                     "tiff-width": "2550",
                     "images-width": "2550",
                     "images-size": "2550,3300",
                     "cropping-bounding-box": "0,0;2550,3300",
                     "big-thumbnail-size": "425,550",
                     "small-thumbnail-size": "85,110",
                     "small-thumbnail-scaling": ("%f" % (float(1) / float(30))),
                     "images-height": "3300",
                     "tiff-height": "3300",
                     })

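
# The legend strings used above carry an "(r,g,b)" color prefix before the
# text to be drawn.  A minimal sketch of splitting such an entry apart --
# assuming add_legend expects exactly this convention, which is inferred from
# the call sites here rather than from add_legend itself:

def _split_legend_entry(entry):
    # "(0,255,0)Some title" -> ((0, 255, 0), "Some title")
    m = re.match(r"\((\d+),(\d+),(\d+)\)(.*)$", entry)
    if not m:
        return None, entry
    return (int(m.group(1)), int(m.group(2)), int(m.group(3))), m.group(4)
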
def update_document_metadata_from_acm_diglib (location):

    def charref_replace(matchobj):
        return unichr(int(matchobj.group('charcode')))

    def parse_endnote(newdict, md, endnote):
        parts = endnote.strip().split("\n")
        authors = ""
        for part in parts:
            p = ENDNOTE_CHARREF.sub(charref_replace, part.strip())
            if p.startswith("%T "):
                newdict['title'] = p[3:].strip()
                newdict['title-is-original-filepath'] = ''
            elif p.startswith("%P "):
                newdict['page-numbers'] = p[3:].strip()
            elif p.startswith("%D "):
                # we override any existing date, because often the PDF file had
                # a bad date in it -- the date it was scanned to add to the library
                year, month, day = parse_date(p[3:].strip())
                newdict['date'] = "%s/%s/%s" % (month, day, year)
            elif p.startswith("%A "):
                # ignore any author metadata in the PDF file
                if authors:
                    authors += " and "
                authors += p[3:].strip()
        if authors:
            newdict['authors'] = authors

    mdpath = os.path.join(location, "metadata.txt")
    md = read_metadata(mdpath)
    if md.has_key("original-url") and "portal.acm.org" in md.get("original-url"):
        bibtex, endnote, abstract = fetch_bibtex_and_endnote_from_acm_diglib(md.get("original-url"))
        if bibtex or endnote:
            d = {}
            if bibtex:
                d['bibtex-citation'] = re.sub("\n", " ", bibtex)
            if endnote:
                parse_endnote(d, md, endnote)
                d['endnote-citation'] = re.sub("\n", " / ", endnote)
            if bibtex and not md.has_key("citation"):
                d["citation"] = re.sub("\n", " ", bibtex)
            if abstract and not md.has_key("abstract"):
                d["abstract"] = re.sub("\n|<par>|</par>", " ", abstract)
            update_metadata(mdpath, d)
        else:
            note("Couldn't fetch citation info for URL \"%s\".", md.get("original-url"))

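
# For reference, parse_endnote above reads EndNote-style records, one
# "%X value" field per line.  A hypothetical record (illustration only, not
# real ACM data):
#
#     %T A Sample Paper
#     %A Jane Doe
#     %A John Smith
#     %D 2003
#     %P 10-19
#
# would produce title "A Sample Paper", authors "Jane Doe and John Smith",
# page-numbers "10-19", and a date assembled from whatever parse_date
# returns for the "%D" field.
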
def process_tarred_folder (repo, id, tarfile, metadata):
    # create a new folder, and populate it
    dirname = tempfile.mktemp()
    try:
        os.mkdir(dirname)
        os.chmod(dirname, 0700)
        cmd = UNTAR_CMD % (dirname, TAR, tarfile)
        note(2, "Untarring folder into temporary directory %s", dirname)
        status, output, signal = subproc(cmd)
        if status == 0:
            note(2, "Successfully untarred folder into %s", dirname)
            if metadata:
                update_metadata(os.path.join(dirname, "metadata.txt"), metadata)
            if (os.path.exists(os.path.join(dirname, "document.tiff")) or
                os.path.isdir(os.path.join(dirname, "page-images"))):
                return process_folder(repo, id, dirname, true)
            else:
                raise Error("invalid folder -- no page images file")
        else:
            raise Error("Problem untarring folder:\n%s" % output)
    finally:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)

def process_zipped_folder (repo, id, zipfile, metadata):
    # create a new folder, and populate it
    dirname = tempfile.mktemp()
    try:
        os.mkdir(dirname)
        os.chmod(dirname, 0700)
        note(2, "Unzipping folder into temporary directory %s", dirname)
        try:
            unzip(dirname, zipfile)
            note(2, "Successfully unzipped folder into %s", dirname)
            if metadata:
                update_metadata(os.path.join(dirname, "metadata.txt"), metadata)
        except:
            typ, ex, tb = sys.exc_info()
            s = string.join(traceback.format_exception(typ, ex, tb))
            raise Error("Problem unzipping folder:\n%s" % s)
        if (os.path.exists(os.path.join(dirname, "document.tiff")) or
            os.path.isdir(os.path.join(dirname, "page-images"))):
            return process_folder(repo, id, dirname, true)
        else:
            raise Error("invalid folder -- no page images")
    finally:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)

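
# Both unpack paths above accept an unpacked folder only if it holds either a
# top-level document.tiff or a page-images subdirectory.  That test, pulled
# out as a sketch (a hypothetical helper, not part of the original module):

def _folder_has_page_images(dirname):
    return (os.path.exists(os.path.join(dirname, "document.tiff")) or
            os.path.isdir(os.path.join(dirname, "page-images")))
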
def do_page_bounding_boxes (dirpath):
    textfilepath = os.path.join(dirpath, "contents.txt")
    wordbox_file = open(os.path.join(dirpath, "wordbboxes"), 'rb')
    pos_filepath = os.path.join(dirpath, "contents.ind")
    para_filepath = os.path.join(dirpath, "paragraphs.txt")
    note("doing page bboxes for %s...", dirpath)
    if os.path.exists(pos_filepath):
        fp = open(pos_filepath, 'r')
        postags = POSTag.parse_parseinfo(fp)
        fp.close()
    else:
        postags = None
    bbox_iterator = wordboxes_page_iterator(dirpath)
    text_file = open(textfilepath, 'rb')
    firstline = text_file.readline()
    charsetmatch = CHARSETPATTERN.match(firstline)
    if charsetmatch:
        charsetname = charsetmatch.group(1)
        text_file.readline()
        first_byte = text_file.tell()
    else:
        charsetname = "latin_1"
        first_byte = 0
    if charsetname not in UTF8_ALIASES:
        raise ValueError("Charset in contents.txt must be UTF-8 for page bounding boxes "
                         "to be created.  Apparently it's %s, instead." % charsetname)
    text_file.seek(first_byte)
    paras = read_paragraphs_file(para_filepath)
    if paras:
        paras.sort(key=lambda x: x.first_byte)
    from createThumbnails import thumbnail_translation_and_scaling
    translation, scaling = thumbnail_translation_and_scaling(dirpath)
    note(4, " translation and scaling are %s and %s...", translation, scaling)

    def update_stats (stats, page_stats):
        if stats:
            stats += ", "
        stats += "%d:%.3f:%d:%d:%d:%d:%.3f" % (
            page_stats[0],
            ((page_stats[0] > 0) and float(page_stats[1]) / float(page_stats[0]) or 0.0),
            page_stats[2], page_stats[3], page_stats[4], page_stats[5],
            ((page_stats[0] > 0) and float(page_stats[6]) / float(page_stats[0]) or 0.0))
        return stats

    page_index = 0
    last_cindex = 0
    postags_index = 0
    stats = ""
    # accumulate stats
    doc_stats = [0,     # number of words
                 0,     # total length (in characters)
                 0,     # number of bold words
                 0,     # number of italic words
                 0,     # number of bold-italic words
                 0,     # number of fixed-width words
                 0.0,   # total font sizes
                 ]
    for page_index, bboxes in bbox_iterator:
        page_stats = [0, 0, 0, 0, 0, 0, 0.0]    # same slots as doc_stats, per page
        adjusted_bboxes = []
        for bbox in bboxes:
            char_count = bbox.nchars()
            doc_stats[0] += 1
            doc_stats[1] += char_count
            if bbox.is_bold():
                doc_stats[2] += 1
            if bbox.is_italic():
                doc_stats[3] += 1
            if bbox.is_bold() and bbox.is_italic():
                doc_stats[4] += 1
            if bbox.is_fixedwidth():
                doc_stats[5] += 1
            doc_stats[6] += bbox.font_size()
            page_stats[0] += 1
            page_stats[1] += char_count
            if bbox.is_bold():
                page_stats[2] += 1
            if bbox.is_italic():
                page_stats[3] += 1
            if bbox.is_bold() and bbox.is_italic():
                page_stats[4] += 1
            if bbox.is_fixedwidth():
                page_stats[5] += 1
            page_stats[6] += bbox.font_size()
            cindex = bbox.contents_offset()
            tag = None
            if postags:
                # advance to first POS tag which might apply to cindex
                while ((postags_index < len(postags)) and
                       (cindex >= (postags[postags_index].start + postags[postags_index].length))):
                    postags_index = postags_index + 1
                # might be cindex positions for which we have no tags -- check for that
                if ((postags_index < len(postags)) and
                    (cindex >= postags[postags_index].start) and
                    (cindex < (postags[postags_index].start + postags[postags_index].length))):
                    tag = postags[postags_index]
            if paras and (paras[0].first_byte <= (cindex + char_count)) and (paras[0].first_byte_not >= cindex):
                # starts this paragraph
                if tag is None:
                    tag = POSTag(cindex, char_count, None, "", True, False, False)
                else:
                    tag.starts_paragraph = True
                paras = paras[1:]
            # again, add back in the 20-pixel border on the page
            ulx = trunc((bbox.left() + translation[0]) * scaling[0] + 0.5)
            uly = trunc((bbox.top() + translation[1]) * scaling[1] + 0.5)
            lrx = trunc((bbox.right() + translation[0]) * scaling[0] + 0.5)
            lry = trunc((bbox.bottom() + translation[1]) * scaling[1] + 0.5)
            adjusted_bboxes.append((bbox, tag, ulx, uly, lrx, lry))
            last_cindex = cindex
        if (len(adjusted_bboxes) > 0):
            startpoint = adjusted_bboxes[0][0].contents_offset()
            # allow up to 4 bytes of UTF-8 per character of the last word
            endpoint = adjusted_bboxes[-1][0].contents_offset() + (adjusted_bboxes[-1][0].nchars() * 4)
            text_file.seek(startpoint + first_byte)
            pagetext = text_file.read(endpoint - startpoint)
            pagestart = startpoint
        else:
            pagetext = ""
            pagestart = last_cindex
        flush_page(dirpath, page_index, adjusted_bboxes, pagetext, pagestart)
        stats = update_stats(stats, page_stats)
    text_file.close()
    wordbox_file.close()
    dstats = update_stats("", doc_stats)
    update_metadata(os.path.join(dirpath, "metadata.txt"),
                    {"wordbbox-stats-pagewise": stats,
                     "wordbbox-stats-docwise": dstats})

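
# Each page's statistics are serialized above as seven ":"-separated fields
# -- word count, mean word length, bold, italic, bold-italic, and fixed-width
# word counts, and mean font size -- with pages joined by ", ".  A minimal
# sketch of reading the "wordbbox-stats-pagewise" value back, assuming
# exactly the format update_stats writes:

def parse_wordbbox_stats(stats):
    pages = []
    for chunk in stats.split(", "):
        if not chunk:
            continue
        f = chunk.split(":")
        pages.append({'words': int(f[0]),
                      'mean-word-length': float(f[1]),
                      'bold-words': int(f[2]),
                      'italic-words': int(f[3]),
                      'bold-italic-words': int(f[4]),
                      'fixed-width-words': int(f[5]),
                      'mean-font-size': float(f[6])})
    return pages
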
def rip (self, folder, docid):

    def encodestring(s):
        # nytimes strings have xml char refs, and we want Unicode
        if not s:
            return s
        s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
        # name2codepoint is a dict mapping entity name to code point
        s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
        return s

    mdpath = os.path.join(folder, "metadata.txt")
    originalspath = os.path.join(folder, "originals", "original.html")
    if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
        return
    md = read_metadata(mdpath)
    url = md.get("original-url")
    if not url:
        return
    host, port, path = parse_URL(url)
    if host != "www.nytimes.com":
        return
    # OK, it's from the NY Times
    new_metadata = MetadataGatherer.parse(originalspath)
    if "source" not in md:
        md["source"] = "New York Times"
    # not all articles have metadata...
    if not ((('title' in new_metadata) or ('hdl' in new_metadata)) and ('pdate' in new_metadata)):
        note(3, "No metadata in article: %s", new_metadata)
        return
    md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
    if "date" not in md:
        # get the date
        d = new_metadata.get("pdate")
        md["date"] = "%s/%s/%s" % (d[4:6], d[6:], d[:4])
    if "authors" not in md:
        # get the byline
        d = new_metadata.get("byl")
        if d:
            if d.startswith("By "):
                d = d[3:]
            # capitalize properly
            d = d.title()
            # lowercase "And"
            d = d.replace(" And ", " and ")
            md["authors"] = encodestring(d)
    d = new_metadata.get("keywords")
    d0 = md.get("keywords")
    if d0 and d:
        d0 += ("," + d)
    elif d:
        d0 = d
    if d0:
        md["keywords"] = encodestring(d0)
    if new_metadata.get("description"):
        md["summary"] = encodestring(new_metadata.get("description"))
    update_metadata(mdpath, md)

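
# encodestring above turns numeric and named XML character references into
# Unicode; a quick illustration (not from the ripper itself):
#
#     encodestring("Caf&#233; &amp; bar")  =>  u"Caf\xe9 & bar"
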
def thumbnail_translation_and_scaling (folder, d=None, update=true, recalc=false):
    # 'translation' is in units of points
    # 'scaling' is in units of pixels/point
    if d is None:
        d = dict()

    def find_data (key):
        return d.get(key) or doc_metadata.get(key)

    def parse_value (x):
        if x is None:
            return None
        elif type(x) in types.StringTypes:
            return eval('(' + x + ')')
        elif type(x) is types.TupleType:
            return x
        else:
            raise ValueError("argument " + str(x) + " must be string or tuple")

    metadata_file = os.path.join(folder, "metadata.txt")
    doc_metadata = read_metadata(metadata_file)
    if recalc:
        translation = None
        scaling = None
    else:
        translation = parse_value(doc_metadata.get("big-thumbnail-translation-points"))
        scaling = parse_value(doc_metadata.get("big-thumbnail-scaling-factor"))
    if scaling is None or translation is None:
        cropbox_data = find_data("cropping-bounding-box")
        images_size = eval('(%s)' % find_data("images-size"))
        if cropbox_data:
            cropbox = [eval('(%s)' % x) for x in cropbox_data.split(';')]
        else:
            cropbox = [(0, 0), images_size]
        big_thumbnail_size = find_data("big-thumbnail-size")
        if big_thumbnail_size:
            big_tn_size = eval('(%s)' % big_thumbnail_size)
        else:
            from PIL import Image
            big_tn_size = Image.open(os.path.join(folder, "thumbnails", "big1.png")).size
        ppi = int(find_data("tiff-dpi") or find_data("images-dpi") or 300)
        # Remember that cropped page images have a 20 pixel border added back after scaling.
        left_crop_border = 0
        right_crop_border = 0
        top_crop_border = 0
        bottom_crop_border = 0
        if cropbox_data:
            if cropbox[0][0] != 0:
                left_crop_border = 20
            if cropbox[0][1] != 0:
                top_crop_border = 20
            if cropbox[1][0] != images_size[0]:
                right_crop_border = 20
            if cropbox[1][1] != images_size[1]:
                bottom_crop_border = 20
        # calculate a translation quantity in "points"
        translation = (0 - float((cropbox[0][0] - left_crop_border) * 72) / ppi,
                       0 - float((cropbox[0][1] - top_crop_border) * 72) / ppi)
        # calculate a scaling factor that goes from bounding box edges in "points" to
        # scaled thumbnail coordinates in "pixels"
        scaling = (float(ppi * big_tn_size[0]) / float(72 * (cropbox[1][0] - cropbox[0][0] + (left_crop_border + right_crop_border))),
                   float(ppi * big_tn_size[1]) / float(72 * (cropbox[1][1] - cropbox[0][1] + (top_crop_border + bottom_crop_border))))
    note(4, " for %s: translation is %f, %f, scaling is %f, %f",
         folder, translation[0], translation[1], scaling[0], scaling[1])
    if update:
        update_metadata(metadata_file, {'big-thumbnail-scaling-factor': "%f,%f" % scaling,
                                        'big-thumbnail-translation-points': "%f,%f" % translation})
    return translation, scaling

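
# Given the translation (points) and scaling (pixels/point) returned above, a
# page coordinate in points maps into big-thumbnail pixel space the same way
# do_page_bounding_boxes does it.  A minimal standalone sketch:

def point_to_thumbnail_pixels(pt, translation, scaling):
    # pt is an (x, y) position in page points; int(v + 0.5) mirrors the
    # trunc(v + 0.5) rounding used in do_page_bounding_boxes
    return (int((pt[0] + translation[0]) * scaling[0] + 0.5),
            int((pt[1] + translation[1]) * scaling[1] + 0.5))
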
def do_thumbnails (dirpath, output_dir, **params):
    note(2, " thumbnailing in %s...", dirpath)
    tmpdir = tempfile.mktemp()
    retval = params.get('returnvalue', false)
    doc_metadata_path = os.path.join(dirpath, "metadata.txt")
    os.mkdir(tmpdir)
    os.chmod(tmpdir, 0700)
    try:
        md = read_metadata(doc_metadata_path)
        is_temporary_doc = md.get("temporary-contents")
        if is_temporary_doc and (is_temporary_doc == "true"):
            # temporary -- don't spend much time on this
            create_temporary_icons(md, dirpath, output_dir, params)
            retval = true
            return retval
        if os.path.exists(os.path.join(dirpath, "document.tiff")):
            # contains a single TIFF file; split it into per-page files
            tiffmaster = os.path.join(tmpdir, "master.tiff")
            split_command = (TIFF_SPLIT_CMD % (TIFFCP, os.path.join(dirpath, "document.tiff"),
                                               tiffmaster, TIFFSPLIT, tiffmaster,
                                               os.path.join(tmpdir, "x")))
            status, output, tsignal = subproc(split_command)
            if status != 0:
                raise Error("'%s' signals non-zero exit status %d in %s => %s"
                            % (split_command, status, dirpath, tmpdir))
            parts_dir = tmpdir
            filecheck_fn = lambda fn: fn[0] == "x"
        elif (os.path.exists(os.path.join(dirpath, "page-images")) and
              os.path.isdir(os.path.join(dirpath, "page-images"))):
            # contains directory full of PNG page images
            parts_dir = os.path.join(dirpath, "page-images")
            filecheck_fn = lambda fn: (fn.startswith('page') and fn.endswith('.png'))
        else:
            raise Error("No page images for document in %s" % dirpath)
        tiff_parts = os.listdir(parts_dir)
        if len(tiff_parts) < 1:
            raise Error("No pages in split tiff file directory after split!")
        # either a PNG-images or a TIFF split will sort properly in lexicographic order
        tiff_parts.sort()
        # see if there's a document icon legend and info about the DPI of the tiff file
        legend = md.get('document-icon-legend')
        tiff_dpi = int(md.get('images-dpi') or md.get('tiff-dpi') or params.get('images-dpi') or 0)
        page_numbers_v = md.get('page-numbers')
        page_numbers = (page_numbers_v and figure_page_numbers(page_numbers_v, dirpath))
        first_page = int(md.get('first-page-number', 1))
        skips = md.get('document-bbox-pages-to-skip', '')
        if skips:
            parts = string.split(skips, ':')
            bbox_skips = []
            for part in parts:
                bbox_skips = bbox_skips + map(int, string.split(part, ','))
        else:
            bbox_skips = None
        # figure bounding box for imaged page
        page_count = 0
        bbox = None
        note(2, " calculating bounding box for large pages...")
        dont_crop = md.get('dont-crop-big-thumbnails', false)
        if AUTO_CROP_BIG_THUMBNAILS and not dont_crop:
            do_bbox = true
        else:
            do_bbox = false
        for tiff_part in tiff_parts:
            if not filecheck_fn(tiff_part):
                continue
            if page_count == 0:
                # find the width and height of the document
                docwidth, docheight = figure_doc_size(os.path.join(parts_dir, tiff_part))
                if not do_bbox:
                    bbox = (0, 0, docwidth, docheight)
            if do_bbox:
                bbox = figure_bbox(os.path.join(parts_dir, tiff_part), page_count, bbox, bbox_skips)
            if (bbox and (bbox[0] == 0) and (bbox[1] == 0) and
                (bbox[2] >= docwidth) and (bbox[3] >= docheight)):
                # don't bother, there's no area to crop already
                do_bbox = false
            page_count = page_count + 1
        if page_count == 0:
            raise Error("No pages in split tiff file directory after split!")
        note(2, " final bbox is %s, page_count is %d", bbox, page_count)
        if USE_VIRTUAL_INK:
            note(2, " alpha channels will be added to large thumbnails...")
        # now make the thumbnails
        big_thumbnail_size = []
        small_thumbnail_size = []
        icon_size = []
        page_index = 0
        for tiff_part in tiff_parts:
            if not filecheck_fn(tiff_part):
                note(3, " skipping %s", tiff_part)
                continue
            tiff_path = os.path.join(parts_dir, tiff_part)
            if page_numbers:
                page_no_string = page_numbers.get(page_index)
            else:
                page_no_string = None
            note(2, " page %d%s", page_index, (page_no_string and " (%s)" % page_no_string) or "")
            try:
                if not create_thumbnail(tiff_path, tiff_dpi, output_dir, page_index, first_page,
                                        page_count, bbox, bbox_skips,
                                        big_thumbnail_size, small_thumbnail_size, icon_size,
                                        params.get('maxwidth'), params.get('maxheight'),
                                        params.get('maxscaling'), params.get('thumbnail_strategy'),
                                        legend, page_no_string):
                    raise Error("Can't create thumbnail for page %d in %s (of %s)"
                                % (page_index, tiff_path, dirpath))
            except Exception, x:
                doc_id = os.path.split(dirpath)[1]
                note("exception creating thumbnails for page %d of document %s:\n%s",
                     page_index, doc_id,
                     string.join(traceback.format_exception(*sys.exc_info()), ""))
                raise AbortDocumentIncorporation(doc_id, str(x))
            if page_index == 0:
                bt_width = big_thumbnail_size[0]
                bt_height = big_thumbnail_size[1]
                st_width = small_thumbnail_size[0]
                st_height = small_thumbnail_size[1]
            else:
                bt_width = max(bt_width, big_thumbnail_size[0])
                bt_height = max(bt_height, big_thumbnail_size[1])
                st_width = max(st_width, small_thumbnail_size[0])
                st_height = max(st_height, small_thumbnail_size[1])
            st_scaling = (float(st_width) / float(docwidth) + float(st_height) / float(docheight)) / 2.0
            page_index = page_index + 1
        d = {"page-count": str(page_count),
             "tiff-width": str(docwidth),
             "images-width": str(docwidth),
             "images-size": "%d,%d" % (docwidth, docheight),
             "cropping-bounding-box": "%d,%d;%d,%d" % bbox,
             "big-thumbnail-size": "%s,%s" % (bt_width, bt_height),
             "small-thumbnail-size": "%s,%s" % (st_width, st_height),
             "small-thumbnail-scaling": "%f" % st_scaling,
             "icon-size": "%d,%d" % icon_size[0],
             "images-height": str(docheight),
             "tiff-height": str(docheight),
             }
        translation, scaling = thumbnail_translation_and_scaling(dirpath, d, false, true)
        d["big-thumbnail-translation-points"] = "%f,%f" % translation
        d["big-thumbnail-scaling-factor"] = "%f,%f" % scaling
        update_metadata(os.path.join(dirpath, "metadata.txt"), d)
        # indicate successful completion
        note(2, " finished.")
        retval = true
        return retval
    finally:
        shutil.rmtree(tmpdir)

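
# st_scaling above is the mean of the width and height shrink ratios.  For
# example, a 2550x3300-pixel page with an 85x110 small thumbnail gives
# (85.0/2550 + 110.0/3300) / 2 = (1/30.0 + 1/30.0) / 2 = 1/30.0 -- the same
# constant that create_temporary_icons writes for temporary documents.
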
def rip(self, folder, docid):

    def encodestring(s):
        # WashPost strings have xml char refs, and we want Unicode
        if not s:
            return s
        s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
        # name2codepoint is a dict mapping entity name to code point
        s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
        return s

    def dequote(s):
        return re.sub(r"\\'", "'", s)

    def catclean(s):
        return re.sub(r"[/,]", "_", s)

    mdpath = os.path.join(folder, "metadata.txt")
    originalspath = os.path.join(folder, "originals", "original.html")
    if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
        return
    md = read_metadata(mdpath)
    url = md.get("original-url")
    if not url:
        return
    host, port, path = parse_URL(url)
    if host != "www.washingtonpost.com":
        return
    # OK, it's from the Post
    new_metadata = MetadataGatherer.parse(originalspath)
    for line in open(originalspath):
        if line.startswith(_HEADLINE):
            line = line[len(_HEADLINE):].strip("\n")
            t = _TITLEPATTERN.match(line)
            if t:
                new_metadata["hdl"] = dequote(t.group("title"))
            m = _AUTHORSPATTERN.search(line)
            if m:
                new_metadata["authors"] = dequote(line[len(m.group(0)):].strip(" ';\n"))
        if line.startswith(_CONTENTID):
            new_metadata["content-id"] = line[len(_CONTENTID):].strip(" ';\n")
        if line.startswith(_SECTION):
            section = line[len(_SECTION):].strip(" ';\n")
            i = section.index("'")
            new_metadata["section"] = section[:i]
    if "source" not in md:
        md["source"] = "Washington Post"
    # not all articles have metadata...
    if not ("hdl" in new_metadata):
        note(3, "No metadata in article: %s", new_metadata)
        return
    md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
    if "date" not in md:
        # get the date
        d = _URLDATEPATTERN.match(url)
        if d:
            md["date"] = "%s/%s/%s" % (d.group("month"), d.group("day"), d.group("year"))
    if "authors" not in md:
        # get the byline
        d = new_metadata.get("authors")
        if d:
            md["authors"] = encodestring(d)
    d = new_metadata.get("keywords")
    d0 = md.get("keywords")
    if d and d0:
        d0 = [x.strip() for x in d0.split(",")] + [x.strip() for x in d.split(";")]
    elif d:
        d0 = [x.strip() for x in d.split(";")]
    elif d0:
        d0 = [x.strip() for x in d0.split(",")]
    if d0:
        md["keywords"] = encodestring(",".join(d0))
    if new_metadata.get("description"):
        md["summary"] = encodestring(new_metadata.get("description"))
        md["abstract"] = encodestring(new_metadata.get("description"))
    section = new_metadata.get("section")
    if section:
        c = md.get("categories")
        if c:
            c = [x.strip() for x in c.split(",")]
        else:
            c = []
        c = c + ["article", "Washington Post/%s" % catclean(section)]
        md["categories"] = ",".join(c)
    content_id = new_metadata.get("content-id")
    if content_id:
        md["citation"] = "Washington Post article %s" % content_id
    update_metadata(mdpath, md)

def update_folder_metadata(self, location, md):
    return update_metadata(self.folder_metadata_path(location), md)

def flesh_out_folder(id, tmpfilename, metadata, repo, unpack_fn, counter):
    try:
        try:
            # note(3, "CODETIMER_ON is %s", CODETIMER_ON)
            # if CODETIMER_ON:
            #     code_timer.Init()
            #     code_timer.CreateTable("uplib")
            #     code_timer.CodeTimerOn()
            #     code_timer.StartInt("newFolder$unpack", "uplib")
            # else:
            #     code_timer.CodeTimerOff()
            if unpack_fn and tmpfilename and os.path.exists(tmpfilename):
                unpack_fn(repo, id, tmpfilename, metadata)
            # if CODETIMER_ON:
            #     code_timer.StopInt("newFolder$unpack", "uplib")
            folderpath = repo.pending_location(id)
            try:
                note("unpacked new folder in %s", folderpath)
                if not sys.platform.lower().startswith("win"):
                    s, o, t = subproc("ls -Rl %s" % folderpath)
                    note("%s\n" % o)
                fp = open(os.path.join(folderpath, "UNPACKED"), 'w')
                fp.flush()
                fp.close()
                # as of this point, we can restart the inclusion of the document
                md = read_metadata(os.path.join(folderpath, "metadata.txt"))
                replacement_id = md.get("replacement-contents-for")
                if replacement_id:
                    if repo.valid_doc_id(replacement_id):
                        # contents to replace another document
                        md["replacement-contents-for"] = ""
                        update_metadata(os.path.join(folderpath, "metadata.txt"), md)
                        note(2, "replacing contents of %s with this data...", replacement_id)
                        existing_document = repo.get_document(replacement_id)
                        new_folder = existing_document.folder()
                        process_folder(repo, replacement_id, folderpath, false, new_folder)
                        _run_rippers(new_folder, repo, replacement_id)
                        existing_document.recache()
                        repo.touch_doc(existing_document)
                        raise AbortDocumentIncorporation(id, "replacement for existing document %s" % replacement_id)
                    else:
                        raise AbortDocumentIncorporation(id, "replacement for non-existent document %s" % replacement_id)
                _finish_inclusion(repo, folderpath, id)
                # if CODETIMER_ON:
                #     noteOut = StringIO.StringIO()
                #     noteOut.write("\nCode Timer statistics (what took time, in milliseconds):\n")
                #     code_timer.PrintTable(noteOut, "uplib")
                #     noteOut.write("\n")
                #     noteOutString = noteOut.getvalue()
                #     note(3, noteOutString)
            except:
                type, value, tb = sys.exc_info()
                note("%s", ''.join(traceback.format_exception(type, value, tb)))
                note_error(folderpath, (type, value, tb))
                raise value, None, tb
        except AbortDocumentIncorporation, x:
            # ripper signalled to stop adopting this document, for good
            note(2, "AbortDocumentIncorporation exception on %s: %s", x.id, x.message)
            if (x.id == id):
                shutil.rmtree(folderpath)
                remove_from_index(repo.index_path(), id)
    except:
        type, value, tb = sys.exc_info()
        note("Exception processing new folder:\n%s",
             ''.join(traceback.format_exception(type, value, tb)))

def rip (self, location, doc_id, debug=None):
    images = findimages(location, debug)
    val = string.join([string.join(x, ":") for x in images], ',')
    update_metadata(os.path.join(location, "metadata.txt"),
                    {'illustrations-bounding-boxes': val})

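
# findimages' results are flattened above with ":" between the fields of a
# single image and "," between images.  A minimal sketch of splitting the
# stored value back apart, assuming no field itself contains ":" or ",":

def parse_illustrations_bounding_boxes(val):
    if not val:
        return []
    return [x.split(":") for x in val.split(",")]
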