def get_boxes(self, sentence):
    """
    Get all the boxes corresponding to the given sentence

    Arguments:
        sentence --- can be a string (will be split), or an array of strings

    Returns:
        an array of boxes (see pyocr boxes)
    """
    if isinstance(sentence, unicode):
        keywords = split_words(sentence)
    else:
        assert isinstance(sentence, list)
        keywords = sentence

    output = []
    for keyword in keywords:
        for box in self.boxes:
            # unfold the generator returned by split_words()
            words = []
            for word in split_words(box.content):
                words.append(word)
            if keyword in words:
                output.append(box)
    return output
def get_boxes(self, sentence):
    """
    Get all the boxes corresponding to the given sentence

    Arguments:
        sentence --- can be a string (will be split), or an array of strings

    Returns:
        an array of boxes (see pyocr boxes)
    """
    if isinstance(sentence, unicode):
        keywords = split_words(sentence)
    else:
        assert isinstance(sentence, list)
        keywords = sentence

    output = []
    for keyword in keywords:
        for line in self.boxes:
            for box in line.word_boxes:
                if keyword in box.content:
                    output.append(box)
                    continue
                # unfold the generator returned by split_words()
                words = [x for x in split_words(box.content)]
                if keyword in words:
                    output.append(box)
    return output
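# Usage sketch (not from the original source; `page` is a hypothetical object
# exposing the get_boxes() method above). Because a unicode sentence is run
# through split_words() first, both call forms below should be equivalent:
#
#     boxes = page.get_boxes(u"total amount")
#     boxes = page.get_boxes([u"total", u"amount"])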
def find_documents(self, sentence):
    """
    Returns all the documents matching the given keywords

    Arguments:
        sentence --- keywords (single string)

    Returns:
        An array of document ids (strings)
    """
    if sentence.strip() == "":
        return self.docs[:]

    positive_keywords = []
    negative_keywords = []

    print("Looking for documents containing %s"
          % (sentence.encode('ascii', 'replace')))
    for keyword in split_words(sentence):
        if keyword[:1] != "!":
            positive_keywords.append(keyword)
        else:
            negative_keywords.append(keyword[1:])

    if len(positive_keywords) == 0 and len(negative_keywords) == 0:
        return []

    documents = None
    if len(positive_keywords) == 0:
        positive_keywords = ["*"]
    for keyword in positive_keywords:
        docs = self.__find_documents(keyword)
        if documents is None:
            documents = docs
        else:
            # intersection of both arrays
            documents = [val for val in documents if val in docs]

    if documents is None:
        return []

    print("Found %d documents" % (len(documents)))

    for keyword in negative_keywords:
        docs = self.__find_documents(keyword)
        print("Found %d documents to remove" % (len(docs)))
        for doc in docs:
            try:
                documents.remove(doc)
            except ValueError:
                pass

    documents.sort()
    return documents
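# Equivalence sketch (illustrative data, not from the original source): the
# list comprehension above computes the same intersection as the set-based
# variant of this method shown further below.
a = ["doc_1", "doc_2", "doc_3"]
b = ["doc_2", "doc_3", "doc_4"]
assert [v for v in a if v in b] == sorted(set(a) & set(b))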
def __contains__(self, sentence):
    words = split_words(sentence)
    words = [word.lower() for word in words]
    txt = self.text
    for line in txt:
        line = line.lower()
        for word in words:
            if word in line:
                return True
    return False
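# Usage sketch (hypothetical `page` object): __contains__() makes the `in`
# operator work on a page. Note that the test is a plain substring match on
# each lowercased line, not a word-boundary match:
#
#     if u"invoice" in page:
#         print("found it")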
def __get_keywords(self):
    """
    Get all the keywords related to this page

    Returns:
        A generator of strings
    """
    for line in self.text:
        for word in split_words(line):
            yield word
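# Stand-in sketch for split_words() (assumption: the real helper lives in
# paperwork's util module and is not shown in these snippets). The code above
# only relies on it returning a generator over the words of a line; a minimal
# regex-based stand-in could look like this, though the real implementation
# may differ (e.g. in how it handles accents, case, or the "*" wildcard).
import re

def split_words(sentence):
    # yield each run of two or more word characters
    for word in re.findall(r"\w\w+", sentence, re.UNICODE):
        yield word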
def find_documents(self, sentence):
    """
    Returns all the documents matching the given keywords

    Arguments:
        sentence --- keywords (single string)

    Returns:
        An array of document ids (strings)
    """
    if sentence.strip() == "":
        return self.docs[:]

    positive_keywords = []
    negative_keywords = []

    print("Looking for documents containing %s"
          % (sentence.encode('ascii', 'replace')))
    for keyword in split_words(sentence):
        if keyword[:1] != "!":
            positive_keywords.append(keyword)
        else:
            negative_keywords.append(keyword[1:])

    if len(positive_keywords) == 0 and len(negative_keywords) == 0:
        return []

    documents = None
    if len(positive_keywords) == 0:
        positive_keywords = ["*"]
    for keyword in positive_keywords:
        docs = self.__find_documents(keyword)
        if documents is None:
            documents = docs
        else:
            documents.intersection_update(docs)

    if documents is None:
        return []

    print("Found %d documents" % (len(documents)))

    for keyword in negative_keywords:
        docs = self.__find_documents(keyword)
        print("Found %d documents to remove" % (len(docs)))
        # difference_update() removes the matching documents;
        # symmetric_difference_update() would wrongly re-add documents that
        # match only the negative keyword
        documents.difference_update(docs)

    documents = list(documents)
    documents.sort()
    return documents
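# Usage sketch (hypothetical `dsearch` object, e.g. a DocSearch instance like
# the one used in the stats script below): keywords prefixed with "!" are
# negative, so this query matches documents containing "invoice" but not
# "draft":
#
#     docids = dsearch.find_documents(u"invoice !draft")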
def add_label(self, label, doc):
    """
    Add a new label to the list of known labels.

    Arguments:
        label --- The new label (see labels.Label)
        doc --- The first document on which this label has been added
    """
    label_words = split_words(label.name)
    for word in label_words:
        self.__index_keyword(doc, word)
        if word not in self.__keywords:
            self.__keywords.append(word)
            self.__keywords.sort()
    if label not in self.label_list:
        self.label_list.append(label)
        self.label_list.sort()
def add_label(self, label, doc):
    """
    Add a new label to the list of known labels.

    Arguments:
        label --- The new label (see labels.Label)
        doc --- The first document on which this label has been added
    """
    label_words = split_words(label.name)
    for word in label_words:
        if word not in self.__keyword_to_docs:
            self.__keywords.append(word)
            self.__keywords.sort()
            self.__keyword_to_docs[word] = set([doc])
        else:
            self.__keyword_to_docs[word].add(doc)
    if label not in self.label_list:
        self.label_list.append(label)
        self.label_list.sort()
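# Sketch (assumption, not shown in these snippets): with the
# __keyword_to_docs reverse index maintained above, the __find_documents()
# lookup used by find_documents() can presumably become a single dict access
# instead of a scan over every document:
#
#     def __find_documents(self, keyword):
#         return self.__keyword_to_docs.get(keyword, set())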
def find_suggestions(self, sentence):
    """
    Search all possible suggestions. Suggestions returned always have at
    least one matching document.

    Arguments:
        sentence --- keywords (single string) for which we want suggestions

    Return:
        An array of sets of keywords. Each set of keywords (-> one string)
        is a suggestion.
    """
    keywords = split_words(sentence)
    results = self.__find_suggestions(self.__unfold_generator(keywords))
    try:
        results.remove(sentence)  # remove strict match if it is here
    except ValueError:
        pass
    results.sort()
    return results
def find_suggestions(self, sentence):
    """
    Search all possible suggestions. Suggestions returned always have at
    least one matching document.

    Arguments:
        sentence --- keywords (single string) for which we want suggestions

    Return:
        An array of sets of keywords. Each set of keywords (-> one string)
        is a suggestion.
    """
    keywords = split_words(sentence)
    # unfold the keyword generator into a plain list
    results = self.__find_suggestions([x for x in keywords])
    try:
        results.remove(sentence)  # remove strict match if it is here
    except ValueError:
        pass
    results.sort()
    return results
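# Usage sketch (hypothetical `dsearch` object): each suggestion is a full
# replacement sentence, and the strict match has already been removed:
#
#     for suggestion in dsearch.find_suggestions(u"invoce"):
#         print(suggestion)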
    except IOError, exc:
        logger.error("Unable to get boxes for '%s': %s"
                     % (self.doc.docid, exc))
        # will fall back on pdf boxes
    except OSError, exc:  # os.stat() failed
        pass  # fall back on what libpoppler tells us

    # TODO: Line support !
    txt = self.pdf_page.get_text()
    pdf_size = self.pdf_page.get_size()
    words = set()
    self.__boxes = []
    for line in txt.split("\n"):
        for word in split_words(unicode(line, encoding='utf-8')):
            words.add(word)
    # each unique word is searched only once in the PDF
    for word in words:
        for rect in self.pdf_page.find_text(word):
            word_box = PdfWordBox(word, rect, pdf_size)
            line_box = PdfLineBox([word_box], rect, pdf_size)
            self.__boxes.append(line_box)
    return self.__boxes

boxes = property(__get_boxes)

def __render_img(self, factor):
    # TODO(Jflesch): In a perfect world, we shouldn't use ImageSurface.
    # We should draw directly on the GtkImage.window.cairo_create()
    # context. It would be much more efficient.
def main():
    pconfig = config.PaperworkConfig()
    pconfig.read()

    print("Opening docs (%s)" % pconfig.workdir)
    print("====================")

    dsearch = docsearch.DocSearch(pconfig.workdir)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    max_pages = 0
    total_word_len = 0
    max_word_len = 0
    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")
    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()
        doc_words = set()
        if doc.nb_pages > max_pages:
            max_pages = doc.nb_pages
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1
            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1
                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)
        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total words len: %d" % total_word_len)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum number of pages in one document: %d" % max_pages)
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f"
          % (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f"
          % (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f"
          % (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f"
          % (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f"
          % (float(total_nb_unique_words_per_doc) / float(nb_docs)))
    except IOError, exc:
        print("Unable to get boxes for '%s': %s" % (self.doc.docid, exc))
        # will fall back on pdf boxes
    except OSError, exc:  # os.stat() failed
        pass  # fall back on what libpoppler tells us

    # TODO: Line support !
    txt = self.pdf_page.get_text()
    pdf_size = self.pdf_page.get_size()
    words = set()
    self.__boxes = []
    for line in txt.split("\n"):
        for word in split_words(unicode(line, encoding="utf-8")):
            words.add(word)
    for word in words:
        for rect in self.pdf_page.find_text(word):
            word_box = PdfWordBox(word, rect, pdf_size)
            line_box = PdfLineBox([word_box], rect, pdf_size)
            self.__boxes.append(line_box)
    return self.__boxes

boxes = property(__get_boxes)

def __render_img(self, factor):
    # TODO(Jflesch): In a perfect world, we shouldn't use ImageSurface.
    # We should draw directly on the GtkImage.window.cairo_create()
    # context. It would be much more efficient.
def main():
    print("Opening index")
    print("=============")
    pconfig = config.PaperworkConfig()
    pconfig.read()  # load the configuration (workdir in particular)
    dsearch = docsearch.DocSearch(pconfig.workdir)

    nb_words = 0
    nb_docs = len(dsearch.docs)
    nb_pages = 0
    total_word_len = 0
    max_word_len = 0
    words = set()
    total_nb_unique_words = 0
    total_nb_unique_words_per_doc = 0

    print("")
    print("Analysis")
    print("========")
    for doc in dsearch.docs:
        sys.stdout.write(str(doc) + ": ")
        sys.stdout.flush()
        doc_words = set()
        for page in doc.pages:
            sys.stdout.write("%d " % (page.page_nb + 1))
            sys.stdout.flush()
            nb_pages += 1
            for line in page.text:
                for word in util.split_words(line):
                    # ignore words too short to be useful
                    if len(word) < 4:
                        continue
                    if word not in words:
                        words.add(word)
                        total_nb_unique_words += 1
                    if word not in doc_words:
                        doc_words.add(word)
                        total_nb_unique_words_per_doc += 1
                    nb_words += 1
                    total_word_len += len(word)
                    if max_word_len < len(word):
                        max_word_len = len(word)
        sys.stdout.write("\n")

    print("")
    print("Statistics")
    print("==========")
    print("Total number of documents: %d" % nb_docs)
    print("Total number of pages: %d" % nb_pages)
    print("Total number of words: %d" % nb_words)
    print("Total number of unique words: %d" % total_nb_unique_words)
    print("===")
    print("Maximum word length: %d" % max_word_len)
    print("Average word length: %f"
          % (float(total_word_len) / float(nb_words)))
    print("Average number of words per page: %f"
          % (float(nb_words) / float(nb_pages)))
    print("Average number of words per document: %f"
          % (float(nb_words) / float(nb_docs)))
    print("Average number of pages per document: %f"
          % (float(nb_pages) / float(nb_docs)))
    print("Average number of unique words per document: %f"
          % (float(total_nb_unique_words_per_doc) / float(nb_docs)))
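# Sketch (assumption): as standalone scripts, both versions of main() above
# presumably end with the usual entry-point guard:
#
#     if __name__ == "__main__":
#         main()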