Exemplo n.º 1
0
    def get_boxes(self, sentence):
        """
        Collect every box whose content matches one of the given keywords.

        Arguments:
            sentence --- either a string (it will be split into words) or an
                already-split list of keyword strings
        Returns:
            an array of boxes (see pyocr boxes)
        """
        if not isinstance(sentence, unicode):
            assert(isinstance(sentence, list))
            keywords = sentence
        else:
            keywords = split_words(sentence)

        matches = []
        for keyword in keywords:
            for box in self.boxes:
                # materialize the generator returned by split_words()
                box_words = list(split_words(box.content))
                if keyword in box_words:
                    matches.append(box)
        return matches
Exemplo n.º 2
0
    def get_boxes(self, sentence):
        """
        Collect every word box matching one of the given keywords.

        Arguments:
            sentence --- either a string (it will be split into words) or an
                already-split list of keyword strings
        Returns:
            an array of boxes (see pyocr boxes)
        """
        if not isinstance(sentence, unicode):
            assert (isinstance(sentence, list))
            keywords = sentence
        else:
            keywords = split_words(sentence)

        hits = []
        for keyword in keywords:
            for line in self.boxes:
                for box in line.word_boxes:
                    # match either on the raw box content or on the
                    # individual words it splits into (short-circuits,
                    # so each box is appended at most once per keyword)
                    if (keyword in box.content
                            or keyword in list(split_words(box.content))):
                        hits.append(box)
        return hits
Exemplo n.º 3
0
    def find_documents(self, sentence):
        """
        Returns all the documents matching the given keywords.

        Arguments:
            sentence --- search string; keywords prefixed with "!" are
                negative (documents matching them are removed from the result)

        Returns:
            A sorted array of document ids (strings)
        """

        # an empty query matches every document
        if sentence.strip() == "":
            return self.docs[:]

        positive_keywords = []
        negative_keywords = []

        print("Looking for documents containing %s"
              % (sentence.encode('ascii', 'replace')))

        for keyword in split_words(sentence):
            if keyword[:1] != "!":
                positive_keywords.append(keyword)
            else:
                negative_keywords.append(keyword[1:])

        if (len(positive_keywords) == 0 and len(negative_keywords) == 0):
            return []

        documents = None

        # negative-only query: start from all documents ("*" wildcard)
        if len(positive_keywords) <= 0:
            positive_keywords = ["*"]

        for keyword in positive_keywords:
            docs = self.__find_documents(keyword)
            if documents is None:
                documents = docs
            else:
                # intersection of both arrays; use a set for O(1) membership
                # tests instead of the previous O(n) list scans
                docs_set = set(docs)
                documents = [val for val in documents if val in docs_set]

        if documents is None:
            return []

        print("Found %d documents" % (len(documents)))

        for keyword in negative_keywords:
            docs = self.__find_documents(keyword)
            # bug fix: report how many documents are about to be removed
            # (len(docs)), not the size of the current result set
            print("Found %d documents to remove" % (len(docs)))
            for doc in docs:
                try:
                    documents.remove(doc)
                except ValueError:
                    # doc didn't match the positive keywords: nothing to do
                    pass

        documents.sort()
        return documents
Exemplo n.º 4
0
 def __contains__(self, sentence):
     # True when any word of `sentence` appears (case-insensitively,
     # as a substring) in any line of this page's text
     needles = [word.lower() for word in split_words(sentence)]
     for raw_line in self.text:
         lowered = raw_line.lower()
         if any(needle in lowered for needle in needles):
             return True
     return False
Exemplo n.º 5
0
    def __get_keywords(self):
        """
        Generator yielding every keyword of this page, line by line.

        Returns:
            An iterator of strings
        """
        for text_line in self.text:
            for keyword in split_words(text_line):
                yield keyword
Exemplo n.º 6
0
 def __contains__(self, sentence):
     # case-insensitive substring search: does any word of `sentence`
     # occur in any line of this page's text?
     searched = split_words(sentence)
     searched = [term.lower() for term in searched]
     for text_line in self.text:
         haystack = text_line.lower()
         for term in searched:
             if term in haystack:
                 return True
     return False
Exemplo n.º 7
0
    def __get_keywords(self):
        """
        Yield all the keywords related to this page, one word at a time.

        Returns:
            An iterator of strings
        """
        for current_line in self.text:
            for current_word in split_words(current_line):
                yield current_word
Exemplo n.º 8
0
    def find_documents(self, sentence):
        """
        Returns all the documents matching the given keywords.

        Arguments:
            sentence --- search string; keywords prefixed with "!" are
                negative (documents matching them are removed from the result)

        Returns:
            A sorted array of document ids (strings)
        """

        # an empty query matches every document
        if sentence.strip() == "":
            return self.docs[:]

        positive_keywords = []
        negative_keywords = []

        print("Looking for documents containing %s" %
              (sentence.encode('ascii', 'replace')))

        for keyword in split_words(sentence):
            if keyword[:1] != "!":
                positive_keywords.append(keyword)
            else:
                negative_keywords.append(keyword[1:])

        if (len(positive_keywords) == 0 and len(negative_keywords) == 0):
            return []

        documents = None

        # negative-only query: start from all documents ("*" wildcard)
        if len(positive_keywords) <= 0:
            positive_keywords = ["*"]

        for keyword in positive_keywords:
            docs = self.__find_documents(keyword)
            if documents is None:
                documents = docs
            else:
                documents.intersection_update(docs)

        if documents is None:
            return []

        print("Found %d documents" % (len(documents)))

        for keyword in negative_keywords:
            docs = self.__find_documents(keyword)
            # bug fix: report how many documents match the negative keyword
            # (len(docs)), not the size of the current result set
            print("Found %d documents to remove" % (len(docs)))
            # bug fix: symmetric_difference_update() would *add* documents
            # matching only the negative keyword; "!" means exclusion, so
            # only removal is wanted (matches the sibling implementation
            # that uses list.remove())
            documents.difference_update(docs)

        documents = list(documents)
        documents.sort()
        return documents
Exemplo n.º 9
0
    def find_documents(self, sentence):
        """
        Returns all the documents matching the given keywords.

        Arguments:
            sentence --- search string; keywords prefixed with "!" are
                negative (documents matching them are removed from the result)

        Returns:
            A sorted array of document ids (strings)
        """

        # an empty query matches every document
        if sentence.strip() == "":
            return self.docs[:]

        positive_keywords = []
        negative_keywords = []

        print("Looking for documents containing %s"
              % (sentence.encode('ascii', 'replace')))

        for keyword in split_words(sentence):
            if keyword[:1] != "!":
                positive_keywords.append(keyword)
            else:
                negative_keywords.append(keyword[1:])

        if (len(positive_keywords) == 0 and len(negative_keywords) == 0):
            return []

        documents = None

        # negative-only query: start from all documents ("*" wildcard)
        if len(positive_keywords) <= 0:
            positive_keywords = ["*"]

        for keyword in positive_keywords:
            docs = self.__find_documents(keyword)
            if documents is None:
                documents = docs
            else:
                documents.intersection_update(docs)

        if documents is None:
            return []

        print("Found %d documents" % (len(documents)))

        for keyword in negative_keywords:
            docs = self.__find_documents(keyword)
            # bug fix: report how many documents match the negative keyword
            # (len(docs)), not the size of the current result set
            print("Found %d documents to remove" % (len(docs)))
            # bug fix: symmetric_difference_update() would *add* documents
            # matching only the negative keyword; "!" means exclusion, so
            # only removal is wanted (matches the sibling implementation
            # that uses list.remove())
            documents.difference_update(docs)

        documents = list(documents)
        documents.sort()
        return documents
Exemplo n.º 10
0
    def add_label(self, label, doc):
        """
        Add a new label to the list of known labels.

        Arguments:
            label --- The new label (see labels.Label)
            doc --- The first document on which this label has been added
        """
        added_keyword = False
        for word in split_words(label.name):
            self.__index_keyword(doc, word)
            if word not in self.__keywords:
                self.__keywords.append(word)
                added_keyword = True
        # sort once after the loop instead of re-sorting the whole list
        # after every single insertion (accidental O(n^2 log n))
        if added_keyword:
            self.__keywords.sort()
        if label not in self.label_list:
            self.label_list.append(label)
            self.label_list.sort()
Exemplo n.º 11
0
    def add_label(self, label, doc):
        """
        Add a new label to the list of known labels.

        Arguments:
            label --- The new label (see labels.Label)
            doc --- The first document on which this label has been added
        """
        added_keyword = False
        for word in split_words(label.name):
            if word not in self.__keyword_to_docs:
                # first time this keyword is seen
                self.__keywords.append(word)
                added_keyword = True
                self.__keyword_to_docs[word] = set([doc])
            else:
                self.__keyword_to_docs[word].add(doc)
        # sort once after the loop instead of re-sorting the whole list
        # after every single insertion (accidental O(n^2 log n))
        if added_keyword:
            self.__keywords.sort()
        if label not in self.label_list:
            self.label_list.append(label)
            self.label_list.sort()
Exemplo n.º 12
0
    def find_suggestions(self, sentence):
        """
        Search all possible suggestions. Suggestions returned always have at
        least one document matching.

        Arguments:
            sentence --- keywords (single strings) for which we want
                suggestions
        Return:
            An array of sets of keywords. Each set of keywords (-> one string)
            is a suggestion.
        """
        keyword_list = self.__unfold_generator(split_words(sentence))
        suggestions = self.__find_suggestions(keyword_list)
        # drop the strict match if it came back as a suggestion
        if sentence in suggestions:
            suggestions.remove(sentence)
        suggestions.sort()
        return suggestions
Exemplo n.º 13
0
    def find_suggestions(self, sentence):
        """
        Search all possible suggestions. Suggestions returned always have at
        least one document matching.

        Arguments:
            sentence --- keywords (single strings) for which we want
                suggestions
        Return:
            An array of sets of keywords. Each set of keywords (-> one string)
            is a suggestion.
        """
        keywords = split_words(sentence)
        # list() unfolds the generator; clearer and faster than the
        # copy-comprehension [x for x in keywords]
        results = self.__find_suggestions(list(keywords))
        try:
            results.remove(sentence)  # remove strict match if it is here
        except ValueError:
            pass
        results.sort()
        return results
Exemplo n.º 14
0
                logger.error("Unable to get boxes for '%s': %s"
                       % (self.doc.docid, exc))
                # will fall back on pdf boxes
        except OSError, exc:  # os.stat() failed
            pass

        # fall back on what libpoppler tells us

        # TODO: Line support !

        txt = self.pdf_page.get_text()
        pdf_size = self.pdf_page.get_size()
        words = set()
        self.__boxes = []
        for line in txt.split("\n"):
            for word in split_words(unicode(line, encoding='utf-8')):
                words.add(word)
        for word in words:
            for rect in self.pdf_page.find_text(word):
                word_box = PdfWordBox(word, rect, pdf_size)
                line_box = PdfLineBox([word_box], rect, pdf_size)
                self.__boxes.append(line_box)
        return self.__boxes

    boxes = property(__get_boxes)

    def __render_img(self, factor):
        # TODO(Jflesch): In a perfect world, we shouldn't use ImageSurface.
        # we should draw directly on the GtkImage.window.cairo_create()
        # context. It would be much more efficient.
Exemplo n.º 15
0
def main():
    """Scan every document of the work directory and print word statistics."""
    pconfig = config.PaperworkConfig()
    pconfig.read()
    print("Opening docs (%s)" % pconfig.workdir)
    print("====================")
    dsearch = docsearch.DocSearch(pconfig.workdir)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0

    total_word_len = 0
    max_word_len = 0

    # all distinct words seen across every document
    words = set()
    # sum over all documents of the number of distinct words in each
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()

        if doc.nb_pages > max_pages:
            max_pages = doc.nb_pages

        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    words.add(word)
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1

                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)

        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words len: %d" % total_word_len)
    print("Total number of unique words: %d" % len(words))
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)

    # guard every average against division by zero: the work directory
    # may contain no document, no page or no (long enough) word at all
    if nb_words > 0:
        print("Average word length: %f"
              % (float(total_word_len) / float(nb_words)))
    if nb_pages > 0:
        print("Average number of words per page: %f"
              % (float(nb_words) / float(nb_pages)))
    if nb_docs > 0:
        print("Average number of words per document: %f"
              % (float(nb_words) / float(nb_docs)))
        print("Average number of pages per document: %f"
              % (float(nb_pages) / float(nb_docs)))
        print("Average number of unique words per document: %f"
              % (float(total_nb_unique_words_per_doc) / float(nb_docs)))
Exemplo n.º 16
0
            except IOError, exc:
                print "Unable to get boxes for '%s': %s" % (self.doc.docid, exc)
                # will fall back on pdf boxes
        except OSError, exc:  # os.stat() failed
            pass

        # fall back on what libpoppler tells us

        # TODO: Line support !

        txt = self.pdf_page.get_text()
        pdf_size = self.pdf_page.get_size()
        words = set()
        self.__boxes = []
        for line in txt.split("\n"):
            for word in split_words(unicode(line, encoding="utf-8")):
                words.add(word)
        for word in words:
            for rect in self.pdf_page.find_text(word):
                word_box = PdfWordBox(word, rect, pdf_size)
                line_box = PdfLineBox([word_box], rect, pdf_size)
                self.__boxes.append(line_box)
        return self.__boxes

    boxes = property(__get_boxes)

    def __render_img(self, factor):
        # TODO(Jflesch): In a perfect world, we shouldn't use ImageSurface.
        # we should draw directly on the GtkImage.window.cairo_create() context.
        # It would be much more efficient.
Exemplo n.º 17
0
def main():
    """Open the document index and print word statistics about it."""
    print("Opening index")
    print("=============")
    pconfig = config.PaperworkConfig()
    # NOTE(review): the sibling stats tool calls pconfig.read() before
    # using pconfig.workdir -- confirm whether it is needed here too
    dsearch = docsearch.DocSearch(pconfig.workdir)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0

    total_word_len = 0
    max_word_len = 0

    # all distinct words seen across every document
    words = set()
    total_nb_unique_words = 0
    # sum over all documents of the number of distinct words in each
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")

    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()

        doc_words = set()

        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1

            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1

                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)

        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    # bug fix: this line used to print total_word_len instead of nb_words
    print("Total number of words: %d" % nb_words)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum word length: %d" % max_word_len)

    # guard every average against division by zero: the index may contain
    # no document, no page or no (long enough) word at all
    if nb_words > 0:
        print("Average word length: %f"
              % (float(total_word_len) / float(nb_words)))
    if nb_pages > 0:
        print("Average number of words per page: %f" %
              (float(nb_words) / float(nb_pages)))
    if nb_docs > 0:
        print("Average number of words per document: %f" %
              (float(nb_words) / float(nb_docs)))
        print("Average number of pages per document: %f" %
              (float(nb_pages) / float(nb_docs)))
        print("Average number of unique words per document: %f" %
              (float(total_nb_unique_words_per_doc) / float(nb_docs)))