예제 #1
0
def main():
    """Re-import every document of the work directory into a scratch copy.

    The configured work directory is opened read-only as the source. Each
    source document is re-imported, file by file (in sorted file name
    order), into a temporary document directory backed by a temporary
    index. After each imported file the word boxes of the source pages are
    copied onto the destination pages. For the first file of a document
    label_guess() and fix_labels() are called; for the following ones
    upd_index() is called.

    Both temporary directories are removed when the function exits, even on
    error, and print_stats() is called once the destination DocSearch has
    been created.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)

    # Everything that can fail after the first mkdtemp() lives inside the
    # try block, so that a failure while creating the second directory or
    # the destination DocSearch does not leave temporary directories behind.
    dst_doc_dir = None
    dst_index_dir = None
    dst_dsearch = None
    try:
        dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
        dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
        print("Destination directories : {} | {}".format(dst_doc_dir,
                                                         dst_index_dir))
        dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)

        documents = sorted(src_dsearch.docs, key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))

            # None until the first file of this document has been imported;
            # afterwards the destination document the next files go into.
            current_doc = None
            for filename in sorted(os.listdir(src_doc.path)):
                if "thumb" in filename:
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if not importers:
                    # nothing knows how to import this file
                    continue
                assert len(importers) == 1, \
                    "more than one importer for {}".format(fileuri)
                importer = importers[0]
                (docs, page,
                 new) = importer.import_doc(fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                if dst_doc.can_edit:
                    # range() instead of xrange(): works on Python 2 and 3
                    for page_nb in range(dst_doc.nb_pages):
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if they match
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = dst_doc

    finally:
        for tmp_dir in (dst_doc_dir, dst_index_dir):
            if tmp_dir is not None:
                rm_rf(tmp_dir)
        # Only report when the simulation stage was actually reached.
        if dst_dsearch is not None:
            print_stats()
예제 #2
0
def main():
    """Print word and label-prediction statistics for the work directory.

    Every page of every document is walked. Words shorter than 4 characters
    are ignored; the remaining ones feed the word counters. For each
    document, the labels returned by dsearch.guess_labels() are compared
    with the labels actually set on the document, for every known label:

    - 'global': the prediction agrees with the document (label set and
      predicted, or not set and not predicted);
    - 'positive': agreeing predictions among the labels set on the document;
    - 'negative': agreeing predictions among the labels not set on it.

    Averages whose denominator is zero (empty work directory, no usable
    word, no label, ...) are reported as 0 or skipped instead of raising
    ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # Float division that tolerates an empty data set.
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    pconfig = config.PaperworkConfig()
    pconfig.read()
    print("Opening docs (%s)" % pconfig.settings['workdir'].value)
    print("====================")
    dsearch = docsearch.DocSearch(pconfig.settings['workdir'].value)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0

    total_word_len = 0
    max_word_len = 0

    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    all_labels = {l.name for l in dsearch.label_list}
    label_keys = ['global', 'positive', 'negative']  # for the order
    total_label_accuracy = {key: 0 for key in label_keys}
    total_labels = {key: 0 for key in label_keys}

    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()

        max_pages = max(max_pages, doc.nb_pages)

        # Keyword stats
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    word_len = len(word)
                    # ignore words too short to be useful
                    if word_len < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1

                    nb_words += 1
                    total_word_len += word_len
                    max_word_len = max(max_word_len, word_len)

        # Label predictions stats
        doc_labels = {l.name for l in doc.labels}
        predicted_labels = {l.name for l in dsearch.guess_labels(doc)}
        accurate = {key: 0 for key in label_keys}
        nb_labels = {
            'global': len(all_labels),
            'positive': len(doc_labels),
            'negative': len(all_labels) - len(doc_labels),
        }
        for key in label_keys:
            total_labels[key] += nb_labels[key]
        for label in all_labels:
            on_doc = label in doc_labels
            if on_doc != (label in predicted_labels):
                # prediction disagrees with the document
                continue
            kind = 'positive' if on_doc else 'negative'
            for key in ('global', kind):
                accurate[key] += 1
                total_label_accuracy[key] += 1
        for key in label_keys:
            total = nb_labels[key]
            if total == 0:
                continue
            sys.stdout.write("\n\t- label prediction accuracy (%s): %d%%" %
                             (key, (100 * accurate[key] // total)))

        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words len: %d" % total_word_len)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f" % ratio(total_word_len, nb_words))
    print("Average number of words per page: %f" % ratio(nb_words, nb_pages))
    print("Average number of words per document: %f" %
          ratio(nb_words, nb_docs))
    print("Average number of pages per document: %f" %
          ratio(nb_pages, nb_docs))
    print("Average number of unique words per document: %f" %
          ratio(total_nb_unique_words_per_doc, nb_docs))
    for key in label_keys:
        total = total_labels[key]
        if total == 0:
            # same rule as the per-document report: nothing to average
            continue
        print("Average accuracy of label prediction (%s): %d%%" %
              (key, (100 * total_label_accuracy[key] // total)))
예제 #3
0
파일: stats.py 프로젝트: jaesivsm/paperwork
def main():
    """Print word statistics for the documents of the work directory.

    Every page of every document is walked; words shorter than 4 characters
    are ignored and the remaining ones feed the counters printed at the end.

    All output goes through single-argument print(...) calls, which behave
    the same as a Python 2 print statement and as the Python 3 function.
    Averages over an empty data set are reported as 0 instead of raising
    ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # Float division that tolerates an empty data set.
        if not denominator:
            return 0.0
        return float(numerator) / float(denominator)

    print("Opening index")
    print("=============")
    pconfig = config.PaperworkConfig()
    dsearch = docsearch.DocSearch(pconfig.workdir)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0

    total_word_len = 0
    max_word_len = 0

    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()

        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    word_len = len(word)
                    # ignore words too short to be useful
                    if word_len < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1

                    nb_words += 1
                    total_word_len += word_len
                    max_word_len = max(max_word_len, word_len)

        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    # This line used to print total_word_len (the summed length of all
    # words) under the "number of words" label.
    print("Total number of words: %d" % nb_words)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f" % ratio(total_word_len, nb_words))
    print("Average number of words per page: %f" % ratio(nb_words, nb_pages))
    print("Average number of words per document: %f" %
          ratio(nb_words, nb_docs))
    print("Average number of pages per document: %f" %
          ratio(nb_pages, nb_docs))
    print("Average number of unique words per document: %f" %
          ratio(total_nb_unique_words_per_doc, nb_docs))