Пример #1
0
    def __init__(self, rootdir, callback=dummy_progress_cb):
        """
        Index files in rootdir (see constructor)

        Arguments:
            rootdir --- directory whose files are indexed; stored as
                self.rootdir
            callback --- called during the indexation (may be called *often*).
                step : DocSearch.INDEX_STEP_READING or
                    DocSearch.INDEX_STEP_SORTING
                progression : how many elements done yet
                total : number of elements to do
                document (only if step == DocSearch.INDEX_STEP_READING): file
                    being read
        """
        self.rootdir = rootdir
        # The index is kept under $XDG_DATA_HOME, falling back to
        # ~/.local/share when that variable is not set.
        base_indexdir = os.getenv("XDG_DATA_HOME",
                                  os.path.expanduser("~/.local/share"))
        self.indexdir = os.path.join(base_indexdir, "paperwork", "index")
        mkdir_p(self.indexdir)

        self.__docs_by_id = {}  # docid --> doc
        self.label_list = []

        # Stays True unless an existing index can be opened *and* its schema
        # matches the one declared on this class.
        need_index_rewrite = True
        try:
            logger.info("Opening index dir '%s' ...", self.indexdir)
            self.index = whoosh.index.open_dir(self.indexdir)
            # check that the schema is up-to-date
            # We use the string representation of the schemas, because previous
            # versions of whoosh don't always implement __eq__
            if str(self.index.schema) == str(self.WHOOSH_SCHEMA):
                need_index_rewrite = False
        except whoosh.index.EmptyIndexError as exc:
            # "except X as exc" is accepted by Python 2.6+ and Python 3,
            # unlike the comma form.
            logger.warning("Failed to open index '%s'", self.indexdir)
            logger.warning("Exception was: %s", exc)
Пример #2
0
 def add_page(self, img, boxes):
     """
     Create a page at index self.nb_pages and attach its content.

     Arguments:
         img --- assigned to the `img` attribute of the new page
         boxes --- assigned to the `boxes` attribute of the new page

     Returns:
         The last element of self.pages, read after the cache was dropped.
     """
     mkdir_p(self.path)
     new_page = ImgPage(self, self.nb_pages)
     new_page.img, new_page.boxes = img, boxes
     # Drop the cache before reading self.pages again.
     self.drop_cache()
     return self.pages[-1]
Пример #3
0
 def add_page(self, img, boxes):
     """
     Create a page at index self.nb_pages and attach its content.

     Arguments:
         img --- assigned to the `img` attribute of the new page
         boxes --- assigned to the `boxes` attribute of the new page

     Returns:
         The last element of self.pages, read after the cache was dropped.
     """
     mkdir_p(self.path)
     # Let the logging module do the formatting, so the message is only
     # built when the record is actually emitted.
     logger.info("Adding page %d to %s", self.nb_pages, self)
     page = ImgPage(self, self.nb_pages)
     page.img = img
     page.boxes = boxes
     # Drop the cache before reading self.pages again.
     self.drop_cache()
     return self.pages[-1]
Пример #4
0
 def load(self, label_name, force_reload=False):
     """
     Make sure a SimpleBayes instance is registered for a label.

     Arguments:
         label_name --- key used in self._bayes; its hash also names the
             cache directory created under self._bayes_dir
         force_reload --- when true, a new instance replaces any existing
             one for this label
     """
     # NOTE(review): on Python 3, hash() of a str is salted per process
     # unless PYTHONHASHSEED is fixed, so this directory name may differ
     # between runs --- confirm whether that matters here.
     hashed = hex(abs(hash(label_name)))
     label_dir = os.path.join(self._bayes_dir, hashed[2:])
     # The directory is created even if nothing ends up being (re)loaded.
     mkdir_p(label_dir)
     if label_name in self._bayes and not force_reload:
         return
     classifier = simplebayes.SimpleBayes(cache_path=label_dir)
     # Register first: the entry stays in place even if cache_train() raises.
     self._bayes[label_name] = classifier
     classifier.cache_train()
Пример #5
0
 def add_page(self, img, boxes):
     """
     Create a page at index self.nb_pages and attach its content.

     Arguments:
         img --- assigned to the `img` attribute of the new page
         boxes --- assigned to the `boxes` attribute of the new page

     Returns:
         The last element of self.pages, read after the cache was dropped
         and self._update_storage(1) was called.
     """
     mkdir_p(self.path)
     logger.info("Adding page %d to %s", self.nb_pages, self)
     appended = ImgPage(self, self.nb_pages)
     appended.img, appended.boxes = img, boxes
     self.drop_cache()
     self._update_storage(1)
     return self.pages[-1]
Пример #6
0
    def steal_page(self, page):
        """
        Steal a page from another document

        Does nothing when the page already belongs to this document.

        Arguments:
            page --- the page to take over; its content is handed to a new
                page created at index self.nb_pages of this document
        """
        if page.doc == self:
            return
        mkdir_p(self.path)

        new_page = ImgPage(self, self.nb_pages)
        # Let the logging module do the formatting, so the message is only
        # built when the record is actually emitted.
        logger.info("%s --> %s", page, new_page)
        new_page._steal_content(page)
        # Drop the cache on both the source document and this one.
        page.doc.drop_cache()
        self.drop_cache()
Пример #7
0
 def load(self, label_name, force_reload=False):
     """
     Make sure a SimpleBayes instance is registered for a label.

     A failure while loading the cache is logged (with traceback) and
     otherwise ignored; the new instance stays registered.

     Arguments:
         label_name --- key used in self._bayes; its hash also names the
             cache directory created under self._bayes_dir
         force_reload --- when true, a new instance replaces any existing
             one for this label
     """
     hashed = hex(abs(hash(label_name)))
     label_dir = os.path.join(self._bayes_dir, hashed[2:])
     # The directory is created even if nothing ends up being (re)loaded.
     mkdir_p(label_dir)
     if label_name in self._bayes and not force_reload:
         return
     classifier = simplebayes.SimpleBayes(cache_path=label_dir)
     self._bayes[label_name] = classifier
     try:
         classifier.cache_train()
     except Exception:
         # Best effort: keep going with the instance as it is.
         logger.exception(
             "Could not load cache for label '%s' from %s",
             label_name, classifier.get_cache_location())
Пример #8
0
    def insert_page(self, img, boxes, page_nb):
        """
        Insert a new page at a given position in this document.

        Pages from the target position up to the last one are shifted by one
        (last page first) before the new page is created.

        Arguments:
            img --- assigned to the `img` attribute of the new page
            boxes --- assigned to the `boxes` attribute of the new page
            page_nb --- index at which to insert; values above self.nb_pages
                are clamped to self.nb_pages

        Returns:
            self.pages[page_nb], read after the cache was dropped.
        """
        mkdir_p(self.path)

        # Logged before clamping, so this shows the position that was asked
        # for.
        logger.info("Inserting page %d to %s", page_nb, self)

        if page_nb > self.nb_pages:
            page_nb = self.nb_pages

        # make a hole ..
        # A dedicated loop index is used so that the page_nb argument is not
        # overwritten while shifting.
        pages = self.pages
        for idx in range(self.nb_pages - 1, page_nb - 1, -1):
            pages[idx].change_index(offset=1)

        # .. and fill it
        new_page = ImgPage(self, page_nb)
        new_page.img = img
        new_page.boxes = boxes
        self.drop_cache()
        return self.pages[page_nb]
Пример #9
0
 def check_workdir(self):
     """
     Make sure the current work dir (see config.PaperworkConfig) is there,
     by calling mkdir_p() on self.rootdir.

     NOTE(review): no settings dialog is opened from this method; the only
     thing it does is the mkdir_p() call.
     """
     mkdir_p(self.rootdir)
Пример #10
0
    def __init__(self, rootdir, indexdir=None,
                 callback=dummy_progress_cb, label_store=None):
        """
        Open (or recreate) the whoosh index, prepare the query parsers and
        the label guesser, then reload the index.

        Arguments:
            rootdir --- stored as self.rootdir
            indexdir --- base directory holding the "index" and
                "label_guessing" sub-directories. When None, "paperwork"
                under $XDG_DATA_HOME (or ~/.local/share) is used.
            callback --- passed to self.reload_index(). Called during the
                indexation (may be called *often*).
                step : DocSearch.INDEX_STEP_READING or
                    DocSearch.INDEX_STEP_SORTING
                progression : how many elements done yet
                total : number of elements to do
                document (only if step == DocSearch.INDEX_STEP_READING): file
                    being read
            label_store --- must be truthy (asserted); stored as
                self.label_store
        """
        assert(label_store)
        self.label_store = label_store
        self.rootdir = rootdir

        if indexdir is None:
            xdg_data_home = os.getenv(
                "XDG_DATA_HOME",
                os.path.expanduser("~/.local/share")
            )
            indexdir = os.path.join(xdg_data_home, "paperwork")
        self.indexdir = os.path.join(indexdir, "index")
        mkdir_p(self.indexdir)
        self.label_guesser_dir = os.path.join(indexdir, "label_guessing")
        mkdir_p(self.label_guesser_dir)

        self._docs_by_id = {}  # docid --> doc
        self.labels = {}  # label name --> label

        # Only set to True once an existing index was opened and its schema
        # was found to match the one declared on this class.
        schema_matches = False
        try:
            logger.info("Opening index dir '%s' ...", self.indexdir)
            self.index = whoosh.index.open_dir(self.indexdir)
            # The schemas are compared through their string representation,
            # because previous versions of whoosh don't always implement
            # __eq__
            schema_matches = (
                str(self.index.schema) == str(self.WHOOSH_SCHEMA)
            )
        except (whoosh.index.EmptyIndexError, ValueError) as err:
            logger.warning("Failed to open index '%s'", self.indexdir)
            logger.warning("Exception was: %s", err)

        if not schema_matches:
            logger.info("Creating a new index")
            self.index = whoosh.index.create_in(self.indexdir,
                                                self.WHOOSH_SCHEMA)
            logger.info("Index '%s' created", self.indexdir)

        self.__searcher = self.index.searcher()

        class CustomFuzzy(whoosh.qparser.query.FuzzyTerm):
            # NOTE(review): the constantscore argument is accepted but the
            # parent is always given constantscore=True --- confirm this is
            # intended.
            def __init__(self, fieldname, text, boost=1.0, maxdist=1,
                         prefixlength=0, constantscore=True):
                whoosh.qparser.query.FuzzyTerm.__init__(
                    self, fieldname, text, boost, maxdist,
                    prefixlength, constantscore=True
                )

        # The same facet list object is shared by every search parameter set.
        sort_facets = [
            whoosh.sorting.ScoreFacet(),
            whoosh.sorting.FieldFacet("date", reverse=True)
        ]

        def params_for(termclass):
            # One parameter set: a parser over "label" and "content" using
            # the given term class, plus the shared sort order.
            return {
                "query_parser": whoosh.qparser.MultifieldParser(
                    ["label", "content"], schema=self.index.schema,
                    termclass=termclass),
                "sortedby": sort_facets,
            }

        self.search_param_list = {
            'fuzzy': [
                params_for(CustomFuzzy),
                params_for(whoosh.qparser.query.Prefix),
            ],
            'strict': [
                params_for(whoosh.query.Term),
            ],
        }

        self.label_guesser = LabelGuesser(self.label_guesser_dir)

        self.check_workdir()
        self.reload_index(callback)