示例#1
0
def main():
    """
    Replay the documents of the configured work directory into a temporary
    DocSearch instance.

    Every file of every source document (except those whose name contains
    "thumb") is offered to docimport; files with no possible importer are
    skipped. The boxes of the source pages are copied onto the imported
    pages when the destination document is editable. For the first file
    imported for a document, label_guess() and fix_labels() are called;
    for the following ones, only the index is updated with upd_index().

    The temporary document and index directories are always removed and
    print_stats() is always called, even if an exception interrupts the
    run.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    # Created inside the try block so that a failure while setting up the
    # destination does not leave dst_doc_dir behind.
    dst_index_dir = None

    try:
        dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
        print("Destination directories : {} | {}".format(dst_doc_dir,
                                                         dst_index_dir))
        dst_dsearch = docsearch.DocSearch(dst_doc_dir,
                                          indexdir=dst_index_dir)

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = sorted(os.listdir(src_doc.path))

            # Destination document being built; None until the first file
            # of this source document has been imported.
            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if not importers:
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                # Only the document list is used; the two other values
                # returned by the importer are ignored here.
                (docs, _page,
                 _new) = importer.import_doc(fileuri, dst_dsearch,
                                             current_doc)
                dst_doc = docs[0]

                # can_edit does not depend on the page: check it once.
                if dst_doc.can_edit:
                    # range() instead of xrange(): works on both Python 2
                    # and Python 3.
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc

    finally:
        rm_rf(dst_doc_dir)
        if dst_index_dir is not None:
            rm_rf(dst_index_dir)
        print_stats()
示例#2
0
 def destroy(self):
     """
     Delete the document. The *whole* document. There will be no survivors.

     Passes self.path to rm_rf(), then calls self.drop_cache(). Both steps
     are reported through the module logger.
     """
     # Let the logging module do the formatting: the message is only built
     # if the record is actually emitted.
     logger.info("Destroying doc: %s", self.path)
     rm_rf(self.path)
     logger.info("Done")
     self.drop_cache()
示例#3
0
 def destroy_index(self):
     """
     Remove the index directory and the label guesser directory.

     This DocSearch object must not be used anymore once this method has
     been called. The next DocSearch instantiation will rebuild the whole
     index.
     """
     logger.info("Destroying the index ...")
     # Each attribute is looked up right before its own removal.
     for dir_attr in ("indexdir", "label_guesser_dir"):
         rm_rf(getattr(self, dir_attr))
     logger.info("Done")
示例#4
0
 def cleanup_rootdir(self, progress_cb=dummy_progress_cb):
     """
     Remove all the crap from the work dir (temporary files, empty
     directories, etc)

     The entries found directly in self.rootdir, and the entries located
     one directory below, are submitted to __must_clean(); those it
     accepts are removed with rm_rf(). Deeper levels are not examined.

     Arguments:
         progress_cb --- callable invoked as
             progress_cb(0, 1, self.INDEX_STEP_CLEANING) before the
             cleanup and progress_cb(1, 1, self.INDEX_STEP_CLEANING)
             once it is done.
     """
     progress_cb(0, 1, self.INDEX_STEP_CLEANING)
     for filename in os.listdir(self.rootdir):
         filepath = os.path.join(self.rootdir, filename)
         if self.__must_clean(filepath):
             # Arguments are handed to the logger so that the message is
             # only formatted when the record is emitted.
             logger.info("Cleanup: Removing '%s'", filepath)
             rm_rf(filepath)
             continue
         if not os.path.isdir(filepath):
             continue
         # we only want to go one subdirectory deep, no more
         for subfilename in os.listdir(filepath):
             subfilepath = os.path.join(filepath, subfilename)
             if self.__must_clean(subfilepath):
                 logger.info("Cleanup: Removing '%s'", subfilepath)
                 rm_rf(subfilepath)
     progress_cb(1, 1, self.INDEX_STEP_CLEANING)
示例#5
0
 def delete_features_files(self):
     """
     Remove the FEATURES_DIR entry located under self.path, using rm_rf().
     """
     features_path = os.path.join(self.path, self.FEATURES_DIR)
     rm_rf(features_path)
示例#6
0
文件: doc.py 项目: smurfix/paperwork
 def destroy(self):
     """
     Remove the document entirely: everything under self.path is handed to
     rm_rf(), then drop_cache() is called on this object.
     """
     doc_path = self.path
     rm_rf(doc_path)
     self.drop_cache()