Example #1
    def init_docsearch(self):
        paperwork.setup_test_env()

        new_docs = set()
        upd_docs = set()
        missing_docs = set()

        config = PaperworkConfig()
        config.read()

        # start from scratch
        dsearch = docsearch.DocSearch(config['workdir'].value)
        dsearch.destroy_index()

        dsearch = docsearch.DocSearch(config['workdir'].value)
        dsearch.reload_index()

        doc_examiner = dsearch.get_doc_examiner()
        doc_examiner.examine_rootdir(
            lambda doc: new_docs.add(doc),      # new document found on disk
            lambda doc: upd_docs.add(doc),      # document changed since indexing
            lambda doc: missing_docs.add(doc),  # document no longer on disk
            lambda _: None,                     # fourth callback unused here
        )

        assert len(upd_docs) == 0
        assert len(missing_docs) == 0

        index_updater = dsearch.get_index_updater()
        for doc in new_docs:
            index_updater.add_doc(doc)
        index_updater.commit()

        return dsearch
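
The examiner/callback pattern above can also be used read-only, for example to
report what a full reindex would change without committing anything. A minimal
sketch reusing only the calls shown above (report_index_drift is a hypothetical
name, not part of the original script):

def report_index_drift():
    # Count the documents the examiner classifies as new, updated or
    # missing, without modifying the index itself.
    config = PaperworkConfig()
    config.read()
    dsearch = docsearch.DocSearch(config['workdir'].value)
    dsearch.reload_index()

    counts = {'new': 0, 'updated': 0, 'missing': 0}
    examiner = dsearch.get_doc_examiner()
    examiner.examine_rootdir(
        lambda doc: counts.update(new=counts['new'] + 1),
        lambda doc: counts.update(updated=counts['updated'] + 1),
        lambda doc: counts.update(missing=counts['missing'] + 1),
        lambda _: None,
    )
    print("new={new} updated={updated} missing={missing}".format(**counts))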
Example #2
def main():
    if len(sys.argv) < 3:
        print("Syntax:")
        print("  {} [min_yeses] [out_csv_file]".format(sys.argv[0]))
        sys.exit(1)

    # the first argument is parsed with eval() and is expected to be a Python
    # literal such as "[0.1, 0.2, 0.5]"; ast.literal_eval() would be safer
    min_yeses = eval(sys.argv[1])
    out_csv_file = sys.argv[2]

    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    nb_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(processes=nb_threads)

    with open(out_csv_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for min_yes in min_yeses:
            pool.apply_async(_run_simulation, (
                src_dsearch,
                min_yes,
                csvwriter,
            ))
        pool.close()
        pool.join()
    print("All done !")
Example #3
def main():
    global g_lang
    global g_dictionnary
    global g_tknzr
    global g_nb_total_pages
    global g_start_time

    print("Will use {} for OCR".format(OCR_TOOL.get_name()))

    print("Initializing dictionnary ...")
    g_lang = "eng"
    if len(sys.argv) > 1:
        # any command-line argument switches the OCR language to French
        g_lang = "fra"

    g_dictionnary = enchant.request_dict(g_lang[:2])
    try:
        g_tknzr = enchant.tokenize.get_tokenizer(g_lang[:2])
    except enchant.tokenize.TokenizerNotFoundError as exc:
        print("Warning: Falling back to default tokenizer ({})".format(exc))
        g_tknzr = enchant.tokenize.get_tokenizer()
    print("Done")

    print("Loading documents list ...")
    pconfig = config.PaperworkConfig()
    pconfig.read()
    work_dir = pconfig.settings['workdir'].value
    dsearch = docsearch.DocSearch(work_dir)
    dsearch.reload_index()
    print("Documents loaded")
    print("")

    print("Initalizing workers ...")
    manager = WorkerManager()
    manager.start()

    factory = JobFactoryImageProcessing()
    print("Done")

    g_start_time = datetime.datetime.now()

    try:
        print("Queueing jobs ...")
        nb_docs = 0
        nb_pages = 0
        for doc in dsearch.docs:
            if not doc.can_edit:  # probably not an OCR-ized doc
                continue
            nb_docs += 1
            for page in doc.pages:
                if not page.can_edit:  # probably not an OCR-ized page
                    continue
                nb_pages += 1
                g_nb_total_pages += 1
                for algos in ALGORITHMS:
                    job = factory.make(page, algos)
                    manager.schedule(job)

        print("Queued jobs : {} docs | {} pages".format(nb_docs, nb_pages))

        manager.wait_for_all()
    finally:
        manager.stop()
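
The enchant dictionary and tokenizer initialized above are presumably consumed
by the image-processing jobs to score OCR output. A minimal sketch of how
pyenchant is typically used for that kind of scoring (spelling_score is a
hypothetical helper, not taken from the original script):

import enchant
import enchant.tokenize

def spelling_score(text, lang="en"):
    # Fraction of the words in `text` that the enchant dictionary accepts
    # as correctly spelled.
    dictionary = enchant.request_dict(lang)
    tknzr = enchant.tokenize.get_tokenizer(lang)
    words = [word for (word, _pos) in tknzr(text)]
    if not words:
        return 0.0
    nb_ok = sum(1 for word in words if dictionary.check(word))
    return nb_ok / len(words)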
Example #4
def main():
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))
    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
    dst_dsearch.reload_index()

    try:
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = os.listdir(src_doc.path)
            files.sort()

            current_doc = None
            dst_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                (docs, page,
                 new) = importer.import_doc(fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]

            if dst_doc is not None:
                fix_labels(dst_dsearch, src_doc, dst_doc)

    finally:
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
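
rm_rf() is called by several of these scripts but never defined in the
snippets; a minimal stdlib equivalent, assuming plain local paths:

import shutil

def rm_rf(path):
    # Recursively delete a directory tree, ignoring errors if it is
    # already gone (mirrors `rm -rf`).
    shutil.rmtree(path, ignore_errors=True)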
Example #5
def main():
    pconfig = config.PaperworkConfig()
    pconfig.read()
    print("Opening docs (%s)" % pconfig.settings['workdir'].value)
    print("====================")
    dsearch = docsearch.DocSearch(pconfig.settings['workdir'].value)
    dsearch.reload_index()

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0

    total_word_len = 0
    max_word_len = 0

    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    all_labels = {l.name for l in dsearch.label_list}
    label_keys = ['exact', 'global', 'positive', 'negative']  # for the order
    total_label_accuracy = {
        'exact': 0,
        'global': 0,
        'positive': 0,
        'negative': 0,
    }
    total_labels = {
        'exact': 0,
        'global': 0,
        'positive': 0,
        'negative': 0,
    }

    for doc in sorted(dsearch.docs, key=lambda x: x.docid):
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()

        if doc.nb_pages > max_pages:
            max_pages = doc.nb_pages

        # Keyword stats
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1

                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)

        # Label predictions stats
        doc_labels = {l.name for l in doc.labels}
        predicted_labels = {
            l.name
            for (l, scores) in dsearch.guess_labels(doc)
        }
        accurate = {
            'exact': 1,
            'global': 0,
            'negative': 0,
            'positive': 0,
        }
        nb_labels = {
            'exact': 1,
            'global': len(all_labels),
            'positive': len(doc_labels),
            'negative': len(all_labels) - len(doc_labels),
        }
        for key in label_keys:
            total_labels[key] += nb_labels[key]
        missing = []
        for label in all_labels:
            if not ((label in doc_labels) ^ (label in predicted_labels)):
                accurate['global'] += 1
                total_label_accuracy['global'] += 1
                if label in doc_labels:
                    accurate['positive'] += 1
                    total_label_accuracy['positive'] += 1
                else:
                    accurate['negative'] += 1
                    total_label_accuracy['negative'] += 1
            else:
                # mismatch: record labels that were predicted but are not
                # actually set on the document
                if label in predicted_labels:
                    missing.append(label)
                accurate['exact'] = 0
        if accurate['exact']:
            total_label_accuracy['exact'] += 1
        for key in label_keys:
            total = nb_labels[key]
            if total == 0:
                continue
            value = accurate[key]
            sys.stdout.write("\n\t- label prediction accuracy (%s): %d%%" %
                             (key, (100 * value / total)))
        sys.stdout.write("\n")
        for missing_label in missing:
            sys.stdout.write("Missing: {}\n".format(missing_label))

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words len: %d" % total_word_len)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f" %
          (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f" %
          (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f" %
          (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f" %
          (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f" %
          (float(total_nb_unique_words_per_doc) / float(nb_docs)))
    for key in label_keys:
        total = total_labels[key]
        value = total_label_accuracy[key]
        if total == 0:
            continue
        print("Average accuracy of label prediction (%s): %f%%" %
              (key, (100 * value / total)))
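
For reference, the four accuracy keys computed above measure different things:
'exact' counts documents whose whole label set was predicted perfectly,
'global' counts per-label matches over all known labels, 'positive' only over
the labels actually on the document, and 'negative' only over the labels
absent from it. A tiny worked example with hypothetical data, using the same
formulas as the loop above:

all_labels = {"bank", "tax", "home"}
doc_labels = {"bank"}
predicted_labels = {"bank", "tax"}

matches = {l for l in all_labels
           if (l in doc_labels) == (l in predicted_labels)}
# matches == {"bank", "home"}: "tax" was predicted but is absent
global_acc = len(matches) / len(all_labels)                  # 2/3
positive_acc = len(matches & doc_labels) / len(doc_labels)   # 1/1
negative_acc = (len(matches - doc_labels) /
                (len(all_labels) - len(doc_labels)))         # 1/2
exact = doc_labels == predicted_labels                       # False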
Example #6
def run_simulation(src_dsearch, min_yes, csvwriter):
    stats = {
        'nb_documents': 0,
        'correct_guess': 0,
        'missing_guess': 0,
        'wrong_guess': 0,
        'nb_src_labels': 0,
        'nb_dst_labels': 0,
        'perfect': 0,
    }

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))
    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)
    dst_dsearch.reload_index()

    dst_dsearch.label_guesser.min_yes = min_yes

    try:
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)

        for src_doc in documents:
            files = os.listdir(src_doc.path)
            files.sort()

            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                (docs, page,
                 new) = importer.import_doc(fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(stats, dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        with g_lock:
            csvwriter.writerow([
                min_yes,
                stats['nb_documents'],
                stats['perfect'],
            ])
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats(stats)
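
print_stats() is likewise not included in these snippets; a plausible minimal
version built on the stats dictionary defined at the top of run_simulation()
(hypothetical, not the original implementation):

def print_stats(stats):
    # Summarize one simulation run; the keys match the stats dict above.
    total = stats['nb_documents']
    if total == 0:
        print("No documents processed")
        return
    print("Documents: {}".format(total))
    print("Perfect label sets: {} ({:.1f}%)".format(
        stats['perfect'], 100.0 * stats['perfect'] / total))
    print("Correct / missing / wrong guesses: {} / {} / {}".format(
        stats['correct_guess'], stats['missing_guess'],
        stats['wrong_guess']))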
Example #7
def main():
    # enable_logging()
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir, use_default_index_client=False)
    src_dsearch.reload_index()

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))
    dst_dsearch = docsearch.DocSearch(dst_doc_dir,
                                      indexdir=dst_index_dir,
                                      use_default_index_client=False)
    dst_dsearch.reload_index()

    print("Testing ...")

    try:
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)

        print("Number of documents: {}".format(len(documents)))

        for src_doc in documents:
            print("Document [{}] | [{}]".format(src_doc.docid, src_doc.path))
            files = [x for x in g_fs.listdir(src_doc.path)]
            files.sort()

            current_doc = None
            for filepath in files:
                print("File: {}".format(filepath))
                filename = g_fs.basename(filepath)
                if "thumb" in filename or "labels" == filename:
                    continue
                importers = docimport.get_possible_importers(
                    [filepath], current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                print("Importer(s): {}".format(", ".join(
                    [str(x) for x in importers])))
                assert (len(importers) == 1)
                importer = importers[0]
                result = importer.import_doc([filepath], dst_dsearch,
                                             current_doc)
                print("Import result: {}".format(str(result.get())))
                if current_doc is None:
                    if not result.new_docs:
                        print("Nothing imported?!")
                        continue
                    dst_doc = result.new_docs[0]
                else:
                    dst_doc = current_doc

                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc
            print("")

    finally:
        print("---")
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
        print_stats()
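
g_fs is assumed to be initialized elsewhere in this last script; with
paperwork-backend it would typically be a GIO-based filesystem helper along
these lines (hedged sketch, adjust to the backend version in use):

from paperwork_backend import fs

# URI-aware filesystem wrapper providing the listdir() / basename() calls
# used above.
g_fs = fs.GioFileSystem()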