def main():
    """
    Replay the configured work directory into a temporary one.

    Reads the Paperwork configuration, opens a DocSearch on its 'workdir'
    setting, and creates two temporary directories (documents + index) on
    which a second DocSearch is opened. Then, for each source document
    (sorted by docid), every file of the document (sorted by name, skipping
    any whose name contains "thumb") is imported into the destination
    DocSearch using the single importer returned by
    docimport.get_possible_importers().

    After each import, the page boxes of the source document are copied onto
    the destination document's pages when the destination document reports
    can_edit. The first imported file of a document goes through
    label_guess() and fix_labels(); later files only trigger upd_index().

    Both temporary directories are removed with rm_rf() on exit, whether or
    not the run succeeded.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))

    try:
        # Built inside the try block so that the temporary directories are
        # still removed if opening the destination DocSearch fails.
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = sorted(os.listdir(src_doc.path))

            # None until the first file of this document has been imported;
            # afterwards, the destination document that receives the
            # following files.
            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if not importers:
                    # nothing knows how to import this file
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                (docs, _page, _new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                # range() instead of xrange(): works on Python 2 and 3
                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        # NOTE(review): stats are printed from the cleanup path, so they are
        # shown even when the run aborted -- confirm this is intended.
        print_stats()
def destroy(self):
    """
    Delete the document. The *whole* document. There will be no survivors.

    Passes self.path to rm_rf(), then calls self.drop_cache(). Progress is
    reported through the module logger at INFO level.
    """
    # The path is handed over as a logging argument so that interpolation
    # is done by the logging framework, and only if the record is emitted.
    logger.info("Destroying doc: %s", self.path)
    rm_rf(self.path)
    logger.info("Done")
    self.drop_cache()
def destroy_index(self):
    """
    Wipe the index from the disk.

    Both self.indexdir and self.label_guesser_dir are handed to rm_rf(), in
    that order. This DocSearch object must not be used anymore once this
    method has been called; the next DocSearch instantiated will rebuild
    the whole index.
    """
    logger.info("Destroying the index ...")
    # attributes are resolved one at a time, right before their removal
    for attr_name in ("indexdir", "label_guesser_dir"):
        rm_rf(getattr(self, attr_name))
    logger.info("Done")
def cleanup_rootdir(self, progress_cb=dummy_progress_cb):
    """
    Remove all the crap from the work dir (temporary files, empty
    directories, etc)

    Every entry of self.rootdir accepted by self.__must_clean() is removed
    with rm_rf(). Entries that are kept and are directories are scanned one
    level deep, and their children are filtered the same way.

    Arguments:
        progress_cb --- callable invoked as
            progress_cb(0, 1, self.INDEX_STEP_CLEANING) before the scan and
            progress_cb(1, 1, self.INDEX_STEP_CLEANING) after it.
    """
    progress_cb(0, 1, self.INDEX_STEP_CLEANING)
    for filename in os.listdir(self.rootdir):
        filepath = os.path.join(self.rootdir, filename)
        if self.__must_clean(filepath):
            # paths are passed as logging arguments (lazy interpolation)
            logger.info("Cleanup: Removing '%s'", filepath)
            rm_rf(filepath)
            continue
        if not os.path.isdir(filepath):
            continue
        # we only want to go one subdirectory deep, no more
        for subfilename in os.listdir(filepath):
            subfilepath = os.path.join(filepath, subfilename)
            if self.__must_clean(subfilepath):
                logger.info("Cleanup: Removing '%s'", subfilepath)
                rm_rf(subfilepath)
    progress_cb(1, 1, self.INDEX_STEP_CLEANING)
def delete_features_files(self):
    """
    Remove the entry named self.FEATURES_DIR located under self.path.

    The removal itself is delegated to rm_rf() (presumably recursive --
    confirm against its definition).
    """
    features_path = os.path.join(self.path, self.FEATURES_DIR)
    rm_rf(features_path)
def destroy(self):
    """
    Remove this document entirely: everything located at self.path is
    handed to rm_rf(), and the caches are then dropped through
    self.drop_cache().
    """
    doc_path = self.path
    rm_rf(doc_path)
    self.drop_cache()