class JobDocScan(Job):
    """
    Job scanning `nb_pages` pages from a scan source into a document, and
    indexing each page once it has been scanned.

    Signals:
        scan-start -- emitted before each page (current page, total)
        ocr-start -- emitted from the progress callback when the OCR step
            begins (current page, total)
        scan-done -- emitted once a page has been scanned and indexed
            (page object, total)
        scan-error -- emitted when scanning a page raised (exception)
    """

    __gsignals__ = {
        'scan-start': (GObject.SignalFlags.RUN_LAST, None,
                       # current page / total
                       (GObject.TYPE_INT, GObject.TYPE_INT)),
        'ocr-start': (GObject.SignalFlags.RUN_LAST, None,
                      # current page / total
                      (GObject.TYPE_INT, GObject.TYPE_INT)),
        'scan-done': (GObject.SignalFlags.RUN_LAST, None,
                      # scanned page (object) / total
                      (GObject.TYPE_PYOBJECT, GObject.TYPE_INT)),
        'scan-error': (GObject.SignalFlags.RUN_LAST, None,
                       # exception
                       (GObject.TYPE_PYOBJECT,)),
    }

    # NOTE(review): presumably read by the job scheduler -- confirm in Job
    can_stop = True
    priority = 500

    def __init__(self, factory, id, config, nb_pages, line_in_treeview,
                 docsearch, doc, scan_src):
        """
        Arguments:
            factory, id -- forwarded as is to Job.__init__()
            config -- provides workdir, scanner_resolution,
                scanner_calibration and langs
            nb_pages -- number of pages to scan
            line_in_treeview -- only stored on the job
            docsearch -- its index_page() is called for each scanned page
            doc -- document to extend; if None, a new ImgDoc is created
                when the job runs
            scan_src -- passed to doc.scan_single_page()
        """
        Job.__init__(self, factory, id)
        self.__config = config
        self.__scan_src = scan_src
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Progress callback given to scan_single_page(): emits 'ocr-start'
        when the OCR step reports its very first progression (0).
        """
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self):
        """
        Scan the pages one by one. Stops at the first page that fails:
        'scan-error' is emitted with the exception and the remaining pages
        are not scanned.
        """
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            try:
                self.doc.scan_single_page(self.__scan_src,
                                          self.__config.scanner_resolution,
                                          self.__config.scanner_calibration,
                                          self.__config.langs,
                                          self.__progress_cb)
                # presumably the page just scanned is the last one of the
                # document
                page = self.doc.pages[self.doc.nb_pages - 1]
                self.docsearch.index_page(page)
                self.emit('scan-done', page, self.nb_pages)
            # 'except X as exc' is valid on both Python 2.6+ and Python 3
            # (the 'except X, exc' form is a syntax error on Python 3)
            except StopIteration as exc:
                logger.warning("Feeder appears to be empty and we "
                               "haven't scanned all the pages yet !")
                self.emit('scan-error', exc)
                # wait for all the jobs to be cancelled
                self._wait(5.0, force=True)
                return
            except Exception as exc:
                logger.error("Error: Exception: %s" % str(exc))
                self.emit('scan-error', exc)
                # wait for all the jobs to be cancelled
                self._wait(5.0, force=True)
                return
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import the specified image as a page of a document and index that page.

    Arguments:
        file_uri -- passed as is to current_doc.import_image()
        config -- provides `workdir` (used when a document must be created)
            and `ocrlang`
        docsearch -- its index_page() is called with the imported page
        current_doc -- document to extend; if None, a new ImgDoc is created

    Returns:
        A tuple (document, page)
    """
    # Identity test: a document object overriding __eq__ must never be
    # mistaken for "no document given".
    if current_doc is None:
        current_doc = ImgDoc(config.workdir)
    current_doc.import_image(file_uri, config.ocrlang)
    # presumably import_image() appends the new page at the end
    page = current_doc.pages[current_doc.nb_pages - 1]
    docsearch.index_page(page)
    return (current_doc, page)
def import_doc(self, file_uri, config, docsearch, current_doc=None):
    """
    Import the specified image as a page of a document and index that page.

    Arguments:
        file_uri -- passed as is to current_doc.import_image()
        config -- provides `workdir` (used when a document must be created)
            and `ocrlang`
        docsearch -- its index_page() is called with the imported page
        current_doc -- document to extend; if None, a new ImgDoc is created

    Returns:
        A tuple (document, page)
    """
    # 'is None' instead of '== None': equality can be overridden by the
    # document class, identity cannot.
    if current_doc is None:
        current_doc = ImgDoc(config.workdir)
    current_doc.import_image(file_uri, config.ocrlang)
    # presumably import_image() appends the new page at the end
    page = current_doc.pages[current_doc.nb_pages - 1]
    docsearch.index_page(page)
    return (current_doc, page)
def do(self):
    """
    Scan self.nb_pages pages into self.doc (created if it is None), index
    each page and emit 'scan-start' / 'scan-done' around each of them.

    Stops at the first page that fails: 'scan-error' is emitted with the
    exception and the remaining pages are not scanned.
    """
    if self.doc is None:
        self.doc = ImgDoc(self.__config.workdir)
    for self.current_page in range(0, self.nb_pages):
        self.emit('scan-start', self.current_page, self.nb_pages)
        try:
            self.doc.scan_single_page(self.__scan_src,
                                      self.__config.scanner_resolution,
                                      self.__config.scanner_calibration,
                                      self.__config.langs,
                                      self.__progress_cb)
            # presumably the page just scanned is the last one of the
            # document
            page = self.doc.pages[self.doc.nb_pages - 1]
            self.docsearch.index_page(page)
            self.emit('scan-done', page, self.nb_pages)
        # 'except X as exc' is valid on both Python 2.6+ and Python 3
        # (the 'except X, exc' form is a syntax error on Python 3)
        except StopIteration as exc:
            logger.warning("Feeder appears to be empty and we "
                           "haven't scanned all the pages yet !")
            self.emit('scan-error', exc)
            # wait for all the jobs to be cancelled
            self._wait(5.0, force=True)
            return
        except Exception as exc:
            logger.error("Error: Exception: %s" % str(exc))
            self.emit('scan-error', exc)
            # wait for all the jobs to be cancelled
            self._wait(5.0, force=True)
            return
class DocScanWorker(Worker):
    """
    Worker scanning `nb_pages` pages into a document, and indexing each
    page once it has been scanned.

    Signals:
        scan-start -- emitted before each page (current page, total)
        ocr-start -- emitted from the progress callback when the OCR step
            begins (current page, total)
        scan-done -- emitted once a page has been scanned and indexed
            (page object, total)
    """

    __gsignals__ = {
        'scan-start': (
            GObject.SignalFlags.RUN_LAST, None,
            # current page / total
            (GObject.TYPE_INT, GObject.TYPE_INT)),
        'ocr-start': (
            GObject.SignalFlags.RUN_LAST, None,
            # current page / total
            (GObject.TYPE_INT, GObject.TYPE_INT)),
        'scan-done': (
            GObject.SignalFlags.RUN_LAST, None,
            # scanned page (object) / total
            (GObject.TYPE_PYOBJECT, GObject.TYPE_INT)),
    }

    can_interrupt = True

    def __init__(self, config, nb_pages, line_in_treeview, docsearch,
                 doc=None):
        """
        Arguments:
            config -- provides workdir, scanner_resolution,
                scanner_calibration and ocrlang
            nb_pages -- number of pages to scan
            line_in_treeview -- stored on the worker and used to build the
                worker name
            docsearch -- its index_page() is called for each scanned page
            doc -- document to extend; if None, a new ImgDoc is created
                when the worker runs
        """
        Worker.__init__(self,
                        "Document scanner (doc %d)" % (line_in_treeview))
        self.__config = config
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Progress callback given to scan_single_page().

        Raises an Exception to abort the scan when self.can_run is false,
        and emits 'ocr-start' when the OCR step reports its very first
        progression (0).
        """
        if not self.can_run:
            raise Exception("Scan interrupted")
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self, scan_src):
        """
        Scan self.nb_pages pages from `scan_src` into self.doc (created if
        it is None), indexing each page and emitting 'scan-start' /
        'scan-done' around each of them.

        Exceptions raised while scanning are not caught here; in that case
        self.current_page is left untouched.
        """
        # Identity test: a document object overriding __eq__ must never be
        # mistaken for "no document given".
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            self.doc.scan_single_page(scan_src,
                                      self.__config.scanner_resolution,
                                      self.__config.scanner_calibration,
                                      self.__config.ocrlang,
                                      self.__progress_cb)
            # presumably the page just scanned is the last one of the
            # document
            page = self.doc.pages[self.doc.nb_pages - 1]
            self.docsearch.index_page(page)
            self.emit('scan-done', page, self.nb_pages)
        self.current_page = None
def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import the specified image as a page of a document, then index it.

    Arguments:
        file_uri -- logged, then passed as is to the document's
            import_image()
        config -- provides `workdir` (used when a document must be created)
            and `langs`
        docsearch -- its index_page() is called with the imported page
        current_doc -- document to extend; if None, a new ImgDoc is created

    Returns:
        A tuple (document, page)
    """
    logger.info("Importing doc '%s'" % (file_uri))
    target_doc = current_doc
    if target_doc is None:
        # no document given: start a new one in the work directory
        target_doc = ImgDoc(config.workdir)
    target_doc.import_image(file_uri, config.langs)
    imported_page = target_doc.pages[target_doc.nb_pages - 1]
    docsearch.index_page(imported_page)
    return (target_doc, imported_page)
def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import the specified image into a document and index the resulting page.

    Arguments:
        file_uri -- logged, then passed as is to the document's
            import_image()
        config -- provides `workdir` (used when a document must be created)
            and `langs`
        docsearch -- its index_page() is called with the imported page
        current_doc -- document to extend; if None, a new ImgDoc is created

    Returns:
        A tuple (document, page)
    """
    logger.info("Importing doc '%s'" % (file_uri))
    if current_doc is not None:
        document = current_doc
    else:
        # no document given: start a new one in the work directory
        document = ImgDoc(config.workdir)
    document.import_image(file_uri, config.langs)
    new_page = document.pages[document.nb_pages - 1]
    docsearch.index_page(new_page)
    return (document, new_page)
def import_doc(file_uri, config, docsearch, current_doc=None):
    """
    Import the specified image as a new page of a document.

    Arguments:
        file_uri -- path or "file://" URI of the image. For URIs, the
            scheme is stripped and percent-escapes (e.g. "%20") are decoded
            so that the result is a usable filesystem path.
        config -- config.settings['workdir'].value is used when a document
            must be created
        docsearch -- not used by this function
        current_doc -- document to extend; if None, a new ImgDoc is created

    Returns:
        A tuple ([document], page, new) where `new` is the value of
        document.is_new before the page was added
    """
    # Local, version-independent import: keeps this function usable on both
    # Python 2 and Python 3 without touching the module imports.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote

    logger.info("Importing doc '%s'" % (file_uri))
    if current_doc is None:
        current_doc = ImgDoc(config.settings['workdir'].value)
    # must be read before add_page(), presumably is_new changes once the
    # document has a page
    new = current_doc.is_new
    uri_scheme = "file://"
    if file_uri.startswith(uri_scheme):
        # XXX(Jflesch): bad bad bad
        # URIs are percent-encoded: without decoding, any path containing
        # a space or a non-ASCII character cannot be opened.
        file_uri = unquote(file_uri[len(uri_scheme):])
    img = Image.open(file_uri)
    page = current_doc.add_page(img, [])
    return ([current_doc], page, new)
def do(self, scan_src):
    """
    Scan self.nb_pages pages from `scan_src` into self.doc (created if it
    is None), indexing each page and emitting 'scan-start' before and
    'scan-done' (with the page object) after each of them.

    Exceptions raised while scanning are not caught here; in that case
    self.current_page is left untouched. On success it is reset to None.
    """
    # Identity test: a document object overriding __eq__ must never be
    # mistaken for "no document given".
    if self.doc is None:
        self.doc = ImgDoc(self.__config.workdir)
    for self.current_page in range(0, self.nb_pages):
        self.emit('scan-start', self.current_page, self.nb_pages)
        self.doc.scan_single_page(scan_src,
                                  self.__config.scanner_resolution,
                                  self.__config.scanner_calibration,
                                  self.__config.ocrlang,
                                  self.__progress_cb)
        # presumably the page just scanned is the last one of the document
        page = self.doc.pages[self.doc.nb_pages - 1]
        self.docsearch.index_page(page)
        self.emit('scan-done', page, self.nb_pages)
    self.current_page = None
def import_doc(file_uri, docsearch, current_doc=None):
    """
    Import the specified image as a new page of a document.

    Arguments:
        file_uri -- path or "file://" URI of the image; for URIs the scheme
            is stripped and the remainder is unquoted
        docsearch -- docsearch.rootdir is used when a document must be
            created
        current_doc -- document to extend; if None, a new ImgDoc is created

    Returns:
        A tuple ([document], page, new) where `new` is the value of
        document.is_new before the page was added
    """
    logger.info("Importing doc '%s'" % (file_uri))
    document = current_doc
    if document is None:
        document = ImgDoc(docsearch.rootdir)
    # read before the page is added
    was_new = document.is_new
    uri_scheme = "file://"
    if file_uri.startswith(uri_scheme):
        # XXX(Jflesch): bad bad bad
        file_uri = urllib.unquote(file_uri[len(uri_scheme):])
    added_page = document.add_page(Image.open(file_uri), [])
    return ([document], added_page, was_new)
class DocScanWorker(Worker):
    """
    Worker scanning `nb_pages` pages into a document, and indexing each
    page once it has been scanned.

    Signals:
        scan-start -- emitted before each page (current page, total)
        ocr-start -- emitted from the progress callback when the OCR step
            begins (current page, total)
        scan-done -- emitted once a page has been scanned and indexed
            (current page, total)
    """

    __gsignals__ = {
        'scan-start': (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
                       # current page / total
                       (gobject.TYPE_INT, gobject.TYPE_INT)),
        'ocr-start': (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
                      # current page / total
                      (gobject.TYPE_INT, gobject.TYPE_INT)),
        'scan-done': (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
                      # current page / total
                      (gobject.TYPE_INT, gobject.TYPE_INT)),
    }

    can_interrupt = True

    def __init__(self, config, nb_pages, line_in_treeview, docsearch,
                 doc=None):
        """
        Arguments:
            config -- provides workdir, scanner_resolution,
                scanner_calibration and ocrlang
            nb_pages -- number of pages to scan
            line_in_treeview -- stored on the worker and used to build the
                worker name
            docsearch -- its index_page() is called for each scanned page
            doc -- document to extend; if None, a new ImgDoc is created
                when the worker runs
        """
        Worker.__init__(self,
                        "Document scanner (doc %d)" % (line_in_treeview))
        self.__config = config
        self.docsearch = docsearch
        self.doc = doc
        self.nb_pages = nb_pages
        self.line_in_treeview = line_in_treeview
        self.current_page = None

    def __progress_cb(self, progression, total, step=None):
        """
        Progress callback given to scan_single_page().

        Raises an Exception to abort the scan when self.can_run is false,
        and emits 'ocr-start' when the OCR step reports its very first
        progression (0).
        """
        if not self.can_run:
            raise Exception("Scan interrupted")
        if progression == 0 and step == ImgPage.SCAN_STEP_OCR:
            self.emit('ocr-start', self.current_page, self.nb_pages)

    def do(self, scan_src):
        """
        Scan self.nb_pages pages from `scan_src` into self.doc (created if
        it is None), indexing each page and emitting 'scan-start' /
        'scan-done' around each of them.

        Exceptions raised while scanning are not caught here; in that case
        self.current_page is left untouched.
        """
        # Identity test: a document object overriding __eq__ must never be
        # mistaken for "no document given".
        if self.doc is None:
            self.doc = ImgDoc(self.__config.workdir)
        for self.current_page in range(0, self.nb_pages):
            self.emit('scan-start', self.current_page, self.nb_pages)
            self.doc.scan_single_page(scan_src,
                                      self.__config.scanner_resolution,
                                      self.__config.scanner_calibration,
                                      self.__config.ocrlang,
                                      self.__progress_cb)
            # presumably the page just scanned is the last one of the
            # document
            page = self.doc.pages[self.doc.nb_pages - 1]
            self.docsearch.index_page(page)
            self.emit('scan-done', self.current_page, self.nb_pages)
        self.current_page = None
def examine_rootdir(self, on_new_doc, on_doc_modified, on_doc_deleted,
                    on_doc_unchanged, progress_cb=dummy_progress_cb):
    """
    Compare the documents known by the index with the content of the
    root directory, and report each difference through a callback.

    Arguments:
        on_new_doc -- called with each document found on disk but not in
            the index
        on_doc_modified -- called with each indexed document whose
            last_mod doesn't match the 'last_read' value of the index
        on_doc_unchanged -- called with each indexed document whose
            last_mod matches
        on_doc_deleted -- called with an ImgDoc built for each docid that
            is in the index but was not found on disk
        progress_cb -- called after each examined document with
            (position, total, DocSearch.INDEX_STEP_CHECKING, doc), and a
            last time with (1, 1, DocSearch.INDEX_STEP_CHECKING)
    """
    # what the index knows: docids, and (doctype, last_read) per docid
    results = self.__searcher.search(whoosh.query.Every(), limit=None)
    indexed_ids = set([entry['docid'] for entry in results])
    indexed_infos = dict(
        (entry['docid'], (entry['doctype'], entry['last_read']))
        for entry in results
    )

    # what is actually on the disk
    docdirs = os.listdir(self.docsearch.rootdir)
    examined = 0
    for docdir in docdirs:
        known = indexed_infos.get(docdir)
        doctype = known[0] if known is not None else None
        doc = self.docsearch.get_doc_from_docid(docdir, doctype, inst=True)
        if doc is None:
            # skipped entries are neither reported nor counted
            continue
        if docdir not in indexed_ids:
            on_new_doc(doc)
        else:
            # found on disk: whatever remains in indexed_ids at the end
            # has been deleted
            indexed_ids.remove(docdir)
            assert known is not None
            last_mod = datetime.datetime.fromtimestamp(doc.last_mod)
            report = (on_doc_modified if known[1] != last_mod
                      else on_doc_unchanged)
            report(doc)
        progress_cb(examined, len(docdirs),
                    DocSearch.INDEX_STEP_CHECKING, doc)
        examined += 1

    # documents still in the index but that don't exist anymore
    for missing_id in indexed_ids:
        # Will be a document with 0 pages
        missing_path = os.path.join(self.docsearch.rootdir, missing_id)
        missing_doc = ImgDoc(missing_path, missing_id,
                             label_store=self.docsearch.label_store)
        on_doc_deleted(missing_doc)

    progress_cb(1, 1, DocSearch.INDEX_STEP_CHECKING)
def do(self, scan_src):
    """
    Scan self.nb_pages pages from `scan_src` into self.doc (created if it
    is None), indexing each page and emitting 'scan-start' before and
    'scan-done' (with the page number) after each of them.

    Exceptions raised while scanning are not caught here; in that case
    self.current_page is left untouched. On success it is reset to None.
    """
    # Identity test: a document object overriding __eq__ must never be
    # mistaken for "no document given".
    if self.doc is None:
        self.doc = ImgDoc(self.__config.workdir)
    for self.current_page in range(0, self.nb_pages):
        self.emit('scan-start', self.current_page, self.nb_pages)
        self.doc.scan_single_page(scan_src,
                                  self.__config.scanner_resolution,
                                  self.__config.scanner_calibration,
                                  self.__config.ocrlang,
                                  self.__progress_cb)
        # presumably the page just scanned is the last one of the document
        page = self.doc.pages[self.doc.nb_pages - 1]
        self.docsearch.index_page(page)
        self.emit('scan-done', self.current_page, self.nb_pages)
    self.current_page = None
def main(src_dir, dst_dir):
    """
    Load the image document stored in `src_dir` and generate in `dst_dir`
    (created here with os.mkdir()) a clone of it, whose content goes
    through a generated character mapping and a generated salt.

    Progress is reported on the standard output.

    Raises:
        Exception -- if the source document has no page
    """

    def announce(message):
        # steps are announced without a newline, so flush to have the
        # message displayed before the step actually runs
        sys.stdout.write(message)
        sys.stdout.flush()

    def done():
        sys.stdout.write("Done\n")

    announce("Loading document %s ... " % src_dir)
    src_doc = ImgDoc(src_dir, os.path.basename(src_dir))
    done()

    if src_doc.nb_pages <= 0:
        raise Exception("No pages found. Is this an image doc ?")

    announce("Analyzing document ... ")
    chars = get_chars(src_doc)
    done()

    announce("Generating salt ... ")
    salt = gen_salt()
    done()
    print("Will use [%s] as salt for the hash" % salt)

    announce("Generating char mapping ... ")
    mapping = generate_mapping(chars)
    done()
    print_mapping(mapping)

    os.mkdir(dst_dir)

    announce("Generating document %s ... " % dst_dir)
    dst_doc = ImgDoc(dst_dir, os.path.basename(dst_dir))
    clone_doc_content(src_doc, dst_doc, mapping, salt)
    sys.stdout.write("... Done\n")

    print("All done")
def __init__(self, main_win, config, widget_tree):
    """
    Build the document list: fetch its widgets, connect its signals and
    actions, and initialize its model.

    Arguments:
        main_win -- main window; its `window` attribute receives the
            accelerator group created here
        config -- configuration; config['workdir'].value is where the
            placeholder for the new document is created
        widget_tree -- object whose get_object() returns the widgets
            "listboxDocList", "doclist_box" and "scrolledwindowDocList"
    """
    self.__main_win = main_win
    self.__config = config
    self.default_thumbnail = self.__init_default_thumbnail(
        JobDocThumbnailer.SMALL_THUMBNAIL_WIDTH,
        JobDocThumbnailer.SMALL_THUMBNAIL_HEIGHT)
    self.gui = {
        'list': widget_tree.get_object("listboxDocList"),
        'box': widget_tree.get_object("doclist_box"),
        'scrollbars': widget_tree.get_object("scrolledwindowDocList"),
        'spinner': SpinnerAnimation((0, 0)),
    }
    # canvas added to the box, hidden for now
    # NOTE(review): presumably used to display the spinner while the
    # document list is loading -- confirm in show_loading()
    self.gui['loading'] = Canvas(self.gui['scrollbars'])
    self.gui['loading'].set_visible(False)
    self.gui['box'].add(self.gui['loading'])
    # the handler is not run from the signal itself: it is scheduled with
    # GLib.idle_add()
    self.gui['scrollbars'].connect(
        "size-allocate",
        lambda x, s: GLib.idle_add(self._on_size_allocate)
    )
    # action name: ([widgets], action)
    self.actions = {
        'open_doc': (
            [
                self.gui['list'],
            ],
            ActionOpenSelectedDocument(main_win, config, self)
        ),
    }
    connect_actions(self.actions)
    self.model = {
        'has_new': False,
        'by_row': {},  # Gtk.ListBoxRow: docid
        'by_id': {},  # docid: Gtk.ListBoxRow
        # keep the thumbnails in cache
        'thumbnails': {}  # docid: pixbuf
    }
    # placeholder document, returned by get_new_doc() as long as it is
    # still new
    self.new_doc = ImgDoc(config['workdir'].value)
    self.job_factories = {
        'doc_thumbnailer': JobFactoryDocThumbnailer(self),
    }
    self.selected_doc = None
    # same here: the handler is scheduled with GLib.idle_add()
    self.gui['scrollbars'].get_vadjustment().connect(
        "value-changed",
        lambda v: GLib.idle_add(self._on_scrollbar_value_changed)
    )
    # the list is a drag'n'drop destination accepting text targets
    self.gui['list'].connect("drag-motion", self._on_drag_motion)
    self.gui['list'].connect("drag-leave", self._on_drag_leave)
    self.gui['list'].connect(
        "drag-data-received", self._on_drag_data_received
    )
    self.gui['list'].drag_dest_set(
        Gtk.DestDefaults.ALL, [], Gdk.DragAction.MOVE
    )
    self.gui['list'].drag_dest_add_text_targets()
    self.accel_group = Gtk.AccelGroup()
    self.__main_win.window.add_accel_group(self.accel_group)
    self.show_loading()
def get_new_doc(self):
    """
    Return the document used as "new document".

    The current one is reused as long as its `is_new` attribute is true;
    otherwise it is replaced by a fresh ImgDoc created in the work
    directory of the configuration.
    """
    if not self.new_doc.is_new:
        # the previous placeholder is not new anymore: replace it
        self.new_doc = ImgDoc(self.__config['workdir'].value)
    return self.new_doc