def main():
    """Simulate a full re-import of the Paperwork work directory.

    Reads the source work directory from the Paperwork configuration,
    imports every document file into a freshly created temporary
    destination (documents + index), copies the OCR boxes page by page,
    and exercises label guessing against the documents' real labels.
    The temporary directories are always removed, then stats are printed.

    Relies on module-level helpers: config, docsearch, docimport,
    label_guess, fix_labels, upd_index, rm_rf, print_stats.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)

    dst_doc_dir = tempfile.mkdtemp(suffix="paperwork-simulate-docs")
    dst_index_dir = tempfile.mkdtemp(suffix="paperwork-simulate-index")
    print("Destination directories : {} | {}".format(dst_doc_dir,
                                                     dst_index_dir))
    dst_dsearch = docsearch.DocSearch(dst_doc_dir, indexdir=dst_index_dir)

    try:
        # Sort for a deterministic, reproducible import order.
        documents = [x for x in src_dsearch.docs]
        documents.sort(key=lambda doc: doc.docid)

        for src_doc in documents:
            print("Document [{}]".format(src_doc.docid))
            files = os.listdir(src_doc.path)
            files.sort()

            current_doc = None
            for filename in files:
                if "thumb" in filename:
                    # thumbnails are generated artifacts, not content
                    continue
                filepath = os.path.join(src_doc.path, filename)
                fileuri = "file://" + filepath
                importers = docimport.get_possible_importers(
                    fileuri, current_doc=current_doc)
                if len(importers) <= 0:
                    continue
                assert (len(importers) == 1)
                importer = importers[0]
                (docs, page, new) = importer.import_doc(
                    fileuri, dst_dsearch, current_doc)
                dst_doc = docs[0]

                # BUG FIX: was xrange(), which does not exist in Python 3;
                # range() behaves identically for iteration on Python 2 too.
                for page_nb in range(0, dst_doc.nb_pages):
                    if dst_doc.can_edit:
                        # Copy OCR boxes from the source page and drop the
                        # destination page cache so they are re-read.
                        dst_doc.pages[page_nb].boxes = \
                            src_doc.pages[page_nb].boxes
                        dst_doc.pages[page_nb].drop_cache()

                if current_doc is None:
                    # first page --> guess labels and see if it matches
                    label_guess(dst_dsearch, src_doc, dst_doc)
                    fix_labels(dst_dsearch, src_doc, dst_doc)
                else:
                    # just update the index
                    upd_index(dst_dsearch, dst_doc, new=False)

                current_doc = docs[0]
    finally:
        # Always clean up the temporary destination, even on failure.
        rm_rf(dst_doc_dir)
        rm_rf(dst_index_dir)
    print_stats()
def main():
    """Analyze the work directory: word statistics and label-guessing accuracy.

    Walks every document in the configured work directory, accumulating
    keyword statistics (word counts, unique words, word lengths, pages)
    and comparing the label guesser's predictions against each document's
    actual labels. Prints per-document progress on stdout, then a summary.

    Accuracy is tracked under three keys:
      - 'global':   all labels, predicted presence XNOR actual presence
      - 'positive': labels the document actually has
      - 'negative': labels the document does not have

    Relies on module-level helpers: config, docsearch, util.
    """
    pconfig = config.PaperworkConfig()
    pconfig.read()

    print("Opening docs (%s)" % pconfig.settings['workdir'].value)
    print("====================")
    dsearch = docsearch.DocSearch(pconfig.settings['workdir'].value)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0
    total_word_len = 0
    max_word_len = 0
    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    all_labels = {l.name for l in dsearch.label_list}
    label_keys = ['global', 'positive', 'negative']  # for the order
    total_label_accuracy = {
        'global': 0,
        'positive': 0,
        'negative': 0,
    }
    total_labels = {
        'global': 0,
        'positive': 0,
        'negative': 0,
    }

    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()
        doc_words = set()
        if doc.nb_pages > max_pages:
            max_pages = doc.nb_pages

        # Keyword stats
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1
            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1
                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)

        # Label predictions stats
        doc_labels = {l.name for l in doc.labels}
        predicted_labels = {l.name for l in dsearch.guess_labels(doc)}
        accurate = {
            'global': 0,
            'negative': 0,
            'positive': 0,
        }
        nb_labels = {
            'global': len(all_labels),
            'positive': len(doc_labels),
            'negative': len(all_labels) - len(doc_labels),
        }
        for key in label_keys:
            total_labels[key] += nb_labels[key]

        for label in all_labels:
            # XNOR: the guess matches reality (both have it, or neither does)
            if not ((label in doc_labels) ^ (label in predicted_labels)):
                accurate['global'] += 1
                total_label_accuracy['global'] += 1
                if label in doc_labels:
                    accurate['positive'] += 1
                    total_label_accuracy['positive'] += 1
                else:
                    accurate['negative'] += 1
                    total_label_accuracy['negative'] += 1

        for key in label_keys:
            total = nb_labels[key]
            if total == 0:
                continue
            sys.stdout.write(
                "\n\t- label prediction accuracy (%s): %d%%"
                % (key, (100 * accurate[key] / total)))
        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words len: %d" % total_word_len)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f" % (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f"
          % (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f"
          % (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f"
          % (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f"
          % (float(total_nb_unique_words_per_doc) / float(nb_docs)))

    for key in label_keys:
        total = total_labels[key]
        value = total_label_accuracy[key]
        # BUG FIX: guard against ZeroDivisionError when no labels exist,
        # matching the per-document loop's behavior above.
        if total == 0:
            continue
        print("Average accuracy of label prediction (%s): %d%%"
              % (key, (100 * value / total)))
def main():
    """Open the index and print keyword statistics for the work directory.

    Walks every page of every document, counting words (ignoring words
    shorter than 4 characters), unique words globally and per document,
    and word lengths, then prints a summary.

    Relies on module-level helpers: config, docsearch, util.
    """
    print("Opening index")
    print("=============")
    pconfig = config.PaperworkConfig()
    dsearch = docsearch.DocSearch(pconfig.workdir)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    total_word_len = 0
    max_word_len = 0
    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")
    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()
        doc_words = set()
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1
            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1
                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)
        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    # BUG FIX: was printing total_word_len under this label; the word
    # *count* is nb_words (total_word_len is the sum of word lengths).
    print("Total number of words: %d" % nb_words)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f" % (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f"
          % (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f"
          % (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f"
          % (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f"
          % (float(total_nb_unique_words_per_doc) / float(nb_docs)))