def _update_doc_in_index(self, index_writer, doc):
    """
    Add/Update a document in the index

    Arguments:
        index_writer --- index writer; must provide delete_by_query() and
            update_document()
        doc --- the document to (re)index

    Returns:
        True
    """
    all_labels = set(self.docsearch.label_list)
    doc_labels = set(doc.labels)
    new_labels = doc_labels.difference(all_labels)

    # can happen when we recreate the index from scratch
    for label in new_labels:
        self.docsearch.create_label(label)

    last_mod = datetime.datetime.fromtimestamp(doc.last_mod)
    docid = unicode(doc.docid)

    dochash = doc.get_docfilehash()
    dochash = (u"%X" % dochash)

    # Extract the text and the labels only once: the values checked here are
    # the ones handed to the index below.
    doc_txt = doc.get_index_text()
    assert (isinstance(doc_txt, unicode))
    labels_txt = doc.get_index_labels()
    assert (isinstance(labels_txt, unicode))

    # Drop any entry already stored under this docid before writing the
    # new one.
    query = whoosh.query.Term("docid", docid)
    index_writer.delete_by_query(query)

    index_writer.update_document(
        docid=docid,
        doctype=doc.doctype,
        docfilehash=dochash,
        content=strip_accents(doc_txt),
        label=strip_accents(labels_txt),
        date=doc.date,
        last_read=last_mod
    )
    return True
def _update_doc_in_index(self, index_writer, doc, fit_label_estimator=True):
    """
    Add/Update a document in the index.

    Labels of the document that are missing from self.docsearch.label_list
    are appended to it, and the list is then re-sorted. When
    fit_label_estimator is true, self.docsearch.fit_label_estimator() is
    called for those new labels (if any) and then for the document itself.

    Always returns True.
    """
    known_labels = set(self.docsearch.label_list)
    unknown_labels = set(doc.labels) - known_labels

    if unknown_labels:
        for unknown_label in unknown_labels:
            self.docsearch.label_list += [unknown_label]
        self.docsearch.label_list.sort()
        if fit_label_estimator:
            self.docsearch.fit_label_estimator(labels=unknown_labels)

    if fit_label_estimator:
        self.docsearch.fit_label_estimator([doc])

    # Index fields, gathered in the order the document is queried.
    fields = {
        'last_read': datetime.datetime.fromtimestamp(doc.last_mod),
        'docid': unicode(doc.docid),
        'docfilehash': u"%X" % doc.get_docfilehash(),
        'doctype': doc.doctype,
        'content': strip_accents(doc.get_index_text()),
        'label': strip_accents(doc.get_index_labels()),
        'date': doc.date,
    }
    index_writer.update_document(**fields)
    return True
def __label_cmp(self, other):
    """
    Three-way comparison with another label. Can be used to sort labels
    alphabetically.

    Names are compared first, accent-stripped and lowercased; the color
    strings are only compared when the names are equivalent. Returns -1
    when other is None.
    """
    if other is None:
        return -1
    own_key = strip_accents(self.name).lower()
    their_key = strip_accents(other.name).lower()
    # A non-zero name ordering wins; otherwise fall back on the colors.
    return (cmp(own_key, their_key)
            or cmp(self.get_color_str(), other.get_color_str()))
def __label_cmp(self, other, text_only=False):
    """
    Three-way comparison with another label. Can be used to sort labels
    alphabetically.

    The accent-stripped, lowercased names are compared first. With
    text_only set, that result is returned as is, so labels with
    equivalent names compare equal regardless of color. Otherwise the
    color strings break the tie. Returns -1 when other is None.
    """
    if other is None:
        return -1
    names = [strip_accents(label.name).lower() for label in (self, other)]
    name_order = cmp(names[0], names[1])
    if text_only or name_order != 0:
        return name_order
    return cmp(self.get_color_str(), other.get_color_str())
def find_documents(self, sentence, limit=None, must_sort=True,
                   search_type='fuzzy'):
    """
    Returns all the documents matching the given keywords

    Arguments:
        sentence --- a sentenced query
        limit --- maximum number of documents to return (None: no limit)
        must_sort --- if True, the query parsers that provide a "sortedby"
            entry have their results sorted accordingly
        search_type --- key of self.search_param_list selecting the query
            parsers to run

    Returns:
        An array of document (doc objects). An empty sentence returns
        self.docs as is.
    """
    sentence = sentence.strip()
    sentence = strip_accents(sentence)

    if sentence == u"":
        return self.docs

    result_list_list = []
    total_results = 0

    for query_parser in self.search_param_list[search_type]:
        query = query_parser["query_parser"].parse(sentence)
        if must_sort and "sortedby" in query_parser:
            result_list = self.__searcher.search(
                query, limit=limit, sortedby=query_parser["sortedby"])
        else:
            result_list = self.__searcher.search(query, limit=limit)

        result_list_list.append(result_list)
        total_results += len(result_list)

        # Stop early only when there is an actual limit to reach. Comparing
        # a count against None never means "no limit": Python 2 orders None
        # below every number (the test is always true) and Python 3 raises
        # TypeError.
        if (not must_sort and limit is not None
                and total_results >= limit):
            break

    if not result_list_list:
        # no query parser for this search type: nothing can match
        return []

    # merging results
    results = result_list_list[0]
    for result_intermediate in result_list_list[1:]:
        results.extend(result_intermediate)

    # Results whose docid has no entry in self._docs_by_id are dropped.
    candidates = (self._docs_by_id.get(result['docid'])
                  for result in results)
    docs = [doc for doc in candidates if doc is not None]

    if limit is not None:
        docs = docs[:limit]

    return docs
def get_index_text(self):
    """
    Returns the text of the document, as a single accent-stripped string.

    The lines of each page are joined with newlines, then the pages and
    self.extra_text (when not empty) are joined with newlines as well.

    Returns:
        The text, or u"empty" if there is none
    """
    chunks = [
        u"\n".join([unicode(line) for line in page.text])
        for page in self.pages
    ]
    extra_txt = self.extra_text
    if extra_txt != u"":
        chunks.append(extra_txt)

    # Chunks are joined with a newline so the last word of a page is not
    # fused with the first word of the next page (or of the extra text).
    txt = u"\n".join(chunks).strip()
    txt = strip_accents(txt)
    if txt == u"":
        # make sure the text field is not empty. Whoosh doesn't like that
        txt = u"empty"
    return txt
def get_index_labels(self):
    """
    Returns the names of the labels of the document, accent-stripped and
    joined with commas into a single unicode string.
    """
    label_names = []
    for label in self.labels:
        label_names.append(strip_accents(unicode(label.name)))
    return u",".join(label_names)