def update_metadata (self, newdict, reindex=true):
    lock_folder(self.__folder)
    if reindex:
        oldvals = self.get_metadata().copy()
    try:
        self.__metadata = p_update_metadata(self.metadata_path(), newdict)
        # invalidate cached values derived from the metadata
        self.__date = None
        self.__category_strings = None
        self.__citation = None
    finally:
        unlock_folder(self.__folder)
    if reindex:
        # show_stack(0, "mysterious re-indexing")
        # drop fields whose values didn't actually change; re-index the rest
        d = newdict.copy()
        for k in d.keys():
            if oldvals.get(k) == d.get(k):
                del d[k]
        newthread = uthread.start_new_thread(_reindex_document_folder,
                                             (self.repo, self.__folder, self.id, d.keys()))
        note(3, "reindexing %s in %s", self.id, str(newthread))
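# Usage sketch (illustrative; "doc" stands for some Document instance, and the
# field values are invented).  Only the fields whose values actually differ
# from the stored metadata are handed to _reindex_document_folder:
#
#   doc.update_metadata({"title": "Annual Report",   # changed   => re-indexed
#                        "authors": "Smith, J."},    # unchanged => dropped
#                       reindex=true)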
def _rerip_worker(response, docs, ripper, other_rippers):
    fp = response.open("text/plain")
    try:
        for doc in docs:
            location = doc.folder()
            lock_folder(location)
            try:
                try:
                    ripper.rip(location, doc.id)
                except:
                    msg = ''.join(traceback.format_exception(*sys.exc_info()))
                    note("Error running %s ripper:\n%s", ripper.name(), msg)
                    fp.write("Error running %s ripper:\n%s" % (ripper.name(), msg))
                else:
                    fp.write("ripped %s with %s\n" % (doc.id, ripper.name()))
                    reruns = [ripper.name()]
                    # run any standard ripper which was explicitly requested, or
                    # which must be re-run after a ripper that has already run
                    for r in doc.repo.rippers():
                        if ((other_rippers and (r.name() in other_rippers)) or
                            any([r.rerun_after_other_ripper(x) for x in reruns])):
                            try:
                                r.rip(location, doc.id)
                            except:
                                msg = ''.join(traceback.format_exception(*sys.exc_info()))
                                note("Error running %s ripper:\n%s", r.name(), msg)
                                fp.write("Error running %s ripper:\n%s" % (r.name(), msg))
                            else:
                                reruns.append(r.name())
                                fp.write("ripped %s with %s\n" % (doc.id, r.name()))
                    # finally, run any Ripper instances passed in directly
                    # (guard against other_rippers being None)
                    for r in [x for x in (other_rippers or []) if isinstance(x, Ripper)]:
                        if r.name() not in reruns:
                            try:
                                r.rip(location, doc.id)
                            except:
                                msg = ''.join(traceback.format_exception(*sys.exc_info()))
                                note("Error running %s ripper:\n%s", r.name(), msg)
                                fp.write("Error running %s ripper:\n%s" % (r.name(), msg))
                            else:
                                reruns.append(r.name())
                                fp.write("ripped %s with %s\n" % (doc.id, r.name()))
            finally:
                unlock_folder(location)
    finally:
        fp.close()
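# Usage sketch (illustrative; "response" and "docs" would come from the web
# machinery that schedules this worker, and "my_ripper" is a hypothetical
# Ripper instance).  other_rippers may mix ripper names with Ripper
# instances; instances not already re-run by name get a final pass:
#
#   _rerip_worker(response, docs, repo.rippers()[0],
#                 ("SomeRipperName", my_ripper))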
def _run_rippers (folderpath, repo, id):
    rippers = repo.rippers()
    lock_folder(folderpath)
    try:
        # the "RIPPING" file records which ripper is currently running,
        # so an interrupted incorporation can be diagnosed later
        which_ripper = os.path.join(folderpath, "RIPPING")
        try:
            for ripper in rippers:
                note("%s: running ripper %s", id, str(ripper))
                fp = open(which_ripper, 'wb')
                fp.write(ripper.__class__.__name__)
                fp.close()
                try:
                    # if CODETIMER_ON:
                    #     code_timer.StartInt("newFolder$ripping-%s" % ripper.name(), "uplib")
                    ripper.rip(folderpath, id)
                finally:
                    pass
                    # if CODETIMER_ON:
                    #     code_timer.StopInt("newFolder$ripping-%s" % ripper.name(), "uplib")
        except AbortDocumentIncorporation:
            type, value, tb = sys.exc_info()
            raise value, None, tb
        except:
            type, value, tb = sys.exc_info()
            note("Running document rippers raised the following exception:\n%s",
                 ''.join(traceback.format_exception(type, value, tb)))
            note("Fixing the problem and restarting the UpLib angel will cause "
                 "this document to be added to the repository.")
            # re-raise the exception
            raise value, None, tb
        else:
            # all rippers succeeded, so remove the marker file
            if os.path.exists(which_ripper):
                os.unlink(which_ripper)
    finally:
        unlock_folder(folderpath)
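# If incorporation is interrupted, the "RIPPING" marker names the ripper
# class that was running.  A recovery check might look like this sketch
# (hypothetical; not part of this module):
#
#   marker = os.path.join(folderpath, "RIPPING")
#   if os.path.exists(marker):
#       note("ripping of %s was interrupted while running %s",
#            id, open(marker).read())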
def _reindex_document_folder(repo, folder, doc_id, changed_fields):
    try:
        import createIndexEntry, createHTML
        lock_folder(folder)
        try:
            note(3, "re-running some rippers on %s...", doc_id)
            standard_rippers = repo.rippers()
            rerun = []
            for ripper in standard_rippers:
                try:
                    if (ripper.rerun_after_metadata_changes(changed_fields=changed_fields) or
                        any([ripper.rerun_after_other_ripper(x.name()) for x in rerun])):
                        note(4, " re-running ripper %s on %s", ripper.name(), doc_id)
                        ripper.rip(folder, doc_id)
                        rerun.append(ripper)
                except:
                    note("Exception running %s on %s:\n%s", ripper, doc_id,
                         ''.join(traceback.format_exception(*sys.exc_info())))
        finally:
            unlock_folder(folder)
    except:
        type, value, tb = sys.exc_info()
        note("while in _reindex_document_folder(%s):\n%s", doc_id,
             ''.join(traceback.format_exception(type, value, tb)))
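# Sketch of the two Ripper hooks this function consults (a minimal
# illustration; a real ripper class would implement rip() as well, and
# "SummaryRipper" is an invented name):
#
#   class TitleCitationRipper(Ripper):
#       def rerun_after_metadata_changes(self, changed_fields=None):
#           # re-run whenever the "title" field was among those updated
#           return changed_fields and ("title" in changed_fields)
#       def rerun_after_other_ripper(self, name):
#           # also re-run whenever this other ripper has re-run
#           return name == "SummaryRipper"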
def process_folder (repo, id, directory, delete_p, replace=None):

    def _protect_files (mode, dirname, files):
        # make everything readable only by the repository's user
        for file in files:
            thepath = os.path.join(dirname, file)
            if os.path.isdir(thepath):
                os.chmod(thepath, 0700)
            else:
                os.chmod(thepath, 0600)

    note(2, "processing folder %s...", directory)
    description = None
    contents = None
    summary = None
    metadata = None
    wordbboxes = os.path.join(directory, "wordbboxes")
    tifffile = os.path.join(directory, "document.tiff")
    pageimagesdir = os.path.join(directory, "page-images")
    images = os.path.join(directory, "images")
    originals = os.path.join(directory, "originals")
    links = os.path.join(directory, "links")
    names = os.listdir(directory)
    for name in names:
        if string.lower(name) == "contents.txt":
            contents = os.path.join(directory, name)
        elif string.lower(name) == "summary.txt":
            summary = os.path.join(directory, name)
        elif string.lower(name) == "metadata.txt":
            metadata = os.path.join(directory, name)
    if replace is None:
        newdir = os.path.join(repo.pending_folder(), id)
    else:
        newdir = replace
        if not os.path.isdir(newdir):
            raise Error("Pending directory %s does not exist!" % newdir)
    try:
        lock_folder(newdir)
        try:
            if os.path.exists(images):
                destpath = os.path.join(newdir, "images")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(images, destpath)
                if delete_p:
                    shutil.rmtree(images, true)
            if os.path.exists(originals):
                destpath = os.path.join(newdir, "originals")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(originals, destpath)
                if delete_p:
                    shutil.rmtree(originals, true)
            if os.path.exists(links):
                destpath = os.path.join(newdir, "links")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(links, destpath)
                if delete_p:
                    shutil.rmtree(links, true)
            if metadata:
                destpath = os.path.join(newdir, "metadata.txt")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                shutil.copyfile(metadata, destpath)
                m = read_metadata(metadata)
                if m.has_key("title"):
                    note("Title of uploaded folder is '%s'", m['title'])
                if delete_p:
                    os.unlink(metadata)
            else:
                # create an empty metadata.txt
                destpath = os.path.join(newdir, "metadata.txt")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                mdf = open(destpath, 'w')
                mdf.flush()
                mdf.close()
            newcontents = os.path.join(newdir, "contents.txt")
            if contents:
                if replace and os.path.exists(newcontents):
                    os.unlink(newcontents)
                shutil.copyfile(contents, newcontents)
                if delete_p:
                    os.unlink(contents)
            newsummary = os.path.join(newdir, "summary.txt")
            if summary:
                if replace and os.path.exists(newsummary):
                    os.unlink(newsummary)
                shutil.copyfile(summary, newsummary)
                if delete_p:
                    os.unlink(summary)
            if os.path.exists(wordbboxes):
                destpath = os.path.join(newdir, "wordbboxes")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                shutil.copyfile(wordbboxes, destpath)
                if delete_p:
                    os.unlink(wordbboxes)
            if os.path.exists(tifffile):
                destpath = os.path.join(newdir, "document.tiff")
                if replace and os.path.exists(destpath):
                    os.unlink(destpath)
                shutil.copyfile(tifffile, destpath)
                if delete_p:
                    os.unlink(tifffile)
            elif os.path.isdir(pageimagesdir):
                destpath = os.path.join(newdir, "page-images")
                if replace and os.path.exists(destpath):
                    shutil.rmtree(destpath)
                shutil.copytree(pageimagesdir, destpath)
                if delete_p:
                    shutil.rmtree(pageimagesdir, true)
            os.path.walk(newdir, _protect_files, None)
            os.chmod(newdir, 0700)
            return id
        finally:
            unlock_folder(newdir)
    except:
        type, value, tb = sys.exc_info()
        if os.path.exists(newdir) and not replace:
            shutil.rmtree(newdir)
        # re-raise the exception
        raise value, None, tb
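# Usage sketch (illustrative; the upload path and document ID are invented,
# and delete_p of false asks process_folder to copy, rather than consume,
# the uploaded files):
#
#   process_folder(repo, "01234-56-7890-123", "/tmp/upload-xyz", false)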
def index_folders (docs_dir, doc_ids, repo_index_dir):
    update_configuration()
    if not doc_ids:
        return
    if LUCENE == 'jcc':
        # in-process indexing through the JCC-wrapped Lucene context
        c = get_context(repo_index_dir)
        SECTION_LOCK.acquire()
        try:
            for id in doc_ids:
                folderpath = os.path.join(docs_dir, id)
                if os.path.isdir(folderpath):
                    lock_folder(folderpath)
                    try:
                        try:
                            c.index(folderpath, id, False)
                        except:
                            note(0, "Can't index folder %s:\n%s", folderpath,
                                 ''.join(traceback.format_exception(*sys.exc_info())))
                    finally:
                        unlock_folder(folderpath)
            c.reopen()
        finally:
            SECTION_LOCK.release()
        return
    else:
        # invoke Java to do the indexing
        if len(doc_ids) > 6:
            # batch mode:  pass the doc IDs in a temporary file
            fname = tempfile.mktemp()
            fp = open(fname, "w")
            fp.write(string.join(doc_ids, '\n'))
            fp.close()
            indexingcmd = INDEXING_BATCHADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES,
                                                   LUCENE_JAR, INDEXING_JAR,
                                                   repo_index_dir, docs_dir, fname)
            note(3, " indexing with %s", indexingcmd)
            SECTION_LOCK.acquire()
            try:
                status, output, tsignal = subproc(indexingcmd)
            finally:
                SECTION_LOCK.release()
            os.unlink(fname)
            note(3, " indexing output is <%s>", output)
            if status != 0:
                raise Error("%s signals non-zero exit status %d attempting to index %s:\n%s"
                            % (JAVA, status, doc_ids, output))
        else:
            # pass the doc IDs directly on the command line
            folders = string.join(doc_ids, ' ')
            indexingcmd = INDEXING_ADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES,
                                              LUCENE_JAR, INDEXING_JAR,
                                              repo_index_dir, docs_dir, folders)
            note(3, " indexing with %s", indexingcmd)
            SECTION_LOCK.acquire()
            try:
                status, output, tsignal = subproc(indexingcmd)
            finally:
                SECTION_LOCK.release()
            note(3, " indexing output is <%s>", output)
            if status != 0:
                raise Error("%s signals non-zero exit status %d attempting to index %s:\n%s"
                            % (JAVA, status, doc_ids, output))
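# Usage sketch (illustrative; "docs_path" and "index_path" are hypothetical
# accessors standing in for wherever the repository keeps its document
# folders and its Lucene index):
#
#   index_folders(repo.docs_path(), [doc.id for doc in changed_docs],
#                 repo.index_path())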