Example #1
    def _get_highlighted_boxes(self, sentence):
        """
        Get all the boxes corresponding the given sentence

        Arguments:
            sentence --- can be string (will be splited), or an array of
                strings
        Returns:
            an array of boxes (see pyocr boxes)
        """
        if isinstance(sentence, unicode):
            keywords = split_words(sentence)
        else:
            assert isinstance(sentence, list)
            keywords = sentence

        output = set()
        for keyword in keywords:
            for box in self.boxes["all"]:
                if keyword in box.content:
                    output.add(box)
                    continue
                # split_words() returns a generator; unfold it into a list
                words = list(split_words(box.content))
                if keyword in words:
                    output.add(box)
        return output
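
Every snippet in this collection leans on a split_words() helper from paperwork.util. The sketch below is only a plausible stand-in (the exact tokenization rules are an assumption), but it is enough to run the examples; note that it deliberately does not lower-case, since Example #2 lower-cases the tokens itself.

import re

def split_words(sentence):
    # Hypothetical stand-in for paperwork.util.split_words():
    # yield each alphanumeric run in the sentence as one word token.
    for word in re.findall(r"\w+", sentence, re.UNICODE):
        yield word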
Example #2
File: page.py Project: smurfix/paperwork
    def __contains__(self, sentence):
        """
        Case-insensitive check: does any line of this page contain
        at least one of the words of the given sentence?
        """
        words = split_words(sentence)
        words = [word.lower() for word in words]
        txt = self.text
        for line in txt:
            line = line.lower()
            for word in words:
                if word in line:
                    return True
        return False
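
Defining __contains__ lets callers use Python's in operator directly on a page object. A minimal usage sketch (the page variable is assumed to be a loaded page instance):

# hypothetical usage; 'page' is assumed to be a loaded page instance
if u"invoice" in page:
    print("this page mentions an invoice")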
Example #3
File: page.py Project: smurfix/paperwork
    def __get_keywords(self):
        """
        Get all the keywords related of this page

        Returns:
            An array of strings
        """
        txt = self.text
        for line in txt:
            for word in split_words(line):
                yield word
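
Since this getter yields one word at a time, the page text never has to be tokenized up front. Assuming the getter is exposed as a property, e.g. keywords = property(__get_keywords) (an assumption here, not shown in the snippet), deduplication becomes a one-liner:

# hypothetical usage; assumes the class defines
# keywords = property(__get_keywords)
unique_keywords = set(page.keywords)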
Example #4
                logger.error("Unable to get boxes for '%s': %s" %
                             (self.doc.docid, exc))
                # will fall back on pdf boxes
        except OSError, exc:  # os.stat() failed
            pass

        # fall back on what libpoppler tells us

        # TODO: Line support!

        txt = self.pdf_page.get_text()
        pdf_size = self.pdf_page.get_size()
        words = set()
        self.__boxes = []
        for line in txt.split("\n"):
            for word in split_words(unicode(line, encoding='utf-8')):
                words.add(word)
        for word in words:
            for rect in self.pdf_page.find_text(word):
                word_box = PdfWordBox(word, rect, pdf_size)
                line_box = PdfLineBox([word_box], rect, pdf_size)
                self.__boxes.append(line_box)
        return self.__boxes

    def __set_boxes(self, boxes):
        boxfile = self.__get_box_path()
        with codecs.open(boxfile, 'w', encoding='utf-8') as file_desc:
            pyocr.builders.LineBoxBuilder().write_file(file_desc, boxes)
        self.drop_cache()
        self.doc.drop_cache()
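
__set_boxes() stores the boxes in pyocr's line-box format. The symmetric read path would go through the same builder's read_file(); the sketch below is a hypothetical __read_boxes() counterpart, reusing __get_box_path() from the snippet above and assuming the same codecs and pyocr imports.

    def __read_boxes(self):
        # sketch: load the line boxes previously written by __set_boxes()
        boxfile = self.__get_box_path()
        with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
            return pyocr.builders.LineBoxBuilder().read_file(file_desc)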
Example #5
def main():
    pconfig = config.PaperworkConfig()
    pconfig.read()
    print("Opening docs (%s)" % pconfig.settings['workdir'].value)
    print("====================")
    dsearch = docsearch.DocSearch(pconfig.settings['workdir'].value)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0

    total_word_len = 0
    max_word_len = 0

    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    all_labels = {l.name for l in dsearch.label_list}
    label_keys = ['global', 'positive', 'negative']  # for the order
    total_label_accuracy = {
        'global': 0,
        'positive': 0,
        'negative': 0,
    }
    total_labels = {
        'global': 0,
        'positive': 0,
        'negative': 0,
    }

    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()

        if doc.nb_pages > max_pages:
            max_pages = doc.nb_pages

        # Keyword stats
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1

                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)

        # Label predictions stats
        doc_labels = {l.name for l in doc.labels}
        predicted_labels = {l.name for l in dsearch.guess_labels(doc)}
        accurate = {
            'global': 0,
            'negative': 0,
            'positive': 0,
        }
        nb_labels = {
            'global': len(all_labels),
            'positive': len(doc_labels),
            'negative': len(all_labels) - len(doc_labels),
        }
        for key in label_keys:
            total_labels[key] += nb_labels[key]
        for label in all_labels:
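            # XNOR: the prediction counts as accurate when both sets
            # agree, i.e. the label is in both sets or in neither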
            if not ((label in doc_labels) ^ (label in predicted_labels)):
                accurate['global'] += 1
                total_label_accuracy['global'] += 1
                if label in doc_labels:
                    accurate['positive'] += 1
                    total_label_accuracy['positive'] += 1
                else:
                    accurate['negative'] += 1
                    total_label_accuracy['negative'] += 1
        for key in label_keys:
            total = nb_labels[key]
            if total == 0:
                continue
            sys.stdout.write("\n\t- label prediction accuracy (%s): %d%%" %
                             (key, (100 * accurate[key] / total)))

        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words len: %d" % total_word_len)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f" %
          (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f" %
          (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f" %
          (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f" %
          (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f" %
          (float(total_nb_unique_words_per_doc) / float(nb_docs)))
    for key in label_keys:
        total = total_labels[key]
        value = total_label_accuracy[key]
        if total == 0:
            continue
        print("Average accuracy of label prediction (%s): %d%%" %
              (key, (100 * value / total)))
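
One design note on the output: 100 * value / total is integer division under Python 2, so every accuracy printed above is truncated toward zero rather than rounded. If the decimals matter, float division with a float format keeps them, e.g. as a drop-in replacement for the print in the loop above:

print("Average accuracy of label prediction (%s): %.1f%%" %
      (key, (100.0 * value / total)))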