from collections import defaultdict
from math import log

from concurrent import futures  # Python 2 backport of concurrent.futures.


def reindex(self, dirty=None):
    # Determine indexable attributes and their weights.
    indexable = {}

    for base in reversed(type.mro(type(self))):
        _ = getattr(base, '_indexable', {})

        if isinstance(_, dict):
            indexable.update(_)

        elif isinstance(_, list):
            indexable.update(dict.fromkeys(_, 1.0))

    # Determine if we actually need to re-index or not.
    if dirty is not None and not set(indexable.keys()).intersection(set(dirty)):
        return

    index = DocumentIndex.objects(doc_id=str(self.id)).first()
    if index:
        index.delete()

    # Determine the number of occurrences of each term, with a per-attribute weight.
    occurrences = defaultdict(float)

    for attr, weight in indexable.iteritems():
        value = getattr(self, attr)

        if isinstance(value, basestring):
            for word in lexer.strip(value):
                occurrences[word] += weight

        elif isinstance(value, (tuple, list, set)):
            # Join iterable values into a single unicode string; pass unicode
            # (not UTF-8 bytes) so the lexer sees the same type as above.
            for word in lexer.strip(u' '.join(value)):
                occurrences[word] += weight

    # Save the index and terms.
    index = DocumentIndex(doc_id=str(self.id), length=len(occurrences), terms=occurrences)
    index.save(safe=False)
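# A minimal usage sketch, assuming Asset is the MongoEngine document these
# methods are defined on; the Page subclass and its fields are hypothetical.
# _indexable maps attribute names to per-attribute weights (the list form
# defaults every listed attribute to 1.0), and declarations are merged down
# the MRO, so subclasses inherit and may override their parents' weights:
#
#     class Page(Asset):
#         _indexable = {'title': 10.0, 'description': 5.0, 'content': 1.0}
#
#     page = Page.objects(path='/about').first()
#     page.title = u'About Us'
#     page.save()
#     page.reindex(dirty=['title'])  # 'title' is indexable, so re-indexing runs.
#     page.reindex(dirty=['acl'])    # 'acl' is not; this returns immediately.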
def delete(self, safe=False):
    # Delete index data, if any.
    index = DocumentIndex.objects(doc_id=str(self.id)).first()
    if index:
        index.delete()

    # Depth-first cascading delete; iterate over a copy, as each child's
    # delete() removes that child from this node's children list.
    for child in list(self.children):
        child.delete()

    # Remove the reference to self from the parent asset.
    if self.parent:
        self.parent.children.remove(self)
        self.parent.save()

    # Actually delete this asset.
    return super(Asset, self).delete(safe=safe)
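# A hedged sketch of the cascade, using a hypothetical /about asset that has
# children of its own:
#
#     about = Asset.objects(path='/about').first()
#     about.delete()
#     # - /about's DocumentIndex record is deleted first
#     # - each child then deletes itself the same way, recursively
#     # - /about detaches from its parent's children list, and the parent is saved
#
# Doing the index cleanup and child deletion before touching the parent means
# no dangling index records or orphaned descendants survive a partial failure
# any later than necessary.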
def results(self, query=None):
    if query is None:
        query = self.query

    if query is None:
        return []

    # Tokenize the query, splitting into inclusive and exclusive term sets.
    terms = keywords(' '.join(strip(query.lower())))
    terms = (set(terms[0] + terms[1]), set(terms[2]))

    query = dict()
    aquery = dict()

    # Pull prefixed terms (e.g. tag:foo, kind:page) out as attribute filters.
    for term in list(terms[0]):
        if ':' in term:
            terms[0].remove(term)
            l, _, r = term.partition(':')

            if l == 'tag':
                aquery.setdefault('tags', list()).append(r)

            elif l == 'kind':
                aquery.setdefault('__raw__', dict())['_cls'] = {'$regex': r, '$options': 'i'}

    # With no scoreable terms left, return the filtered assets unranked.
    if not terms[0] and not terms[1]:
        def gen():
            for record in Asset.objects(**aquery).only('title', 'description', 'path', 'acl').order_by('created'):
                yield 1.0, record

        return gen()

    for term in terms[0]:
        query['terms__%s__exists' % (term, )] = True

    for term in terms[1]:
        query['terms__%s__exists' % (term, )] = False

    # Calculate the inverse document frequency for each term.
    idfs = {}
    num_docs = DocumentIndex.objects.count()

    if not num_docs:
        return []  # Nothing has been indexed yet; nothing to rank.

    for term in terms[0]:
        term_docs = DocumentIndex.objects(**{'terms__%s__exists' % (term, ): True}).count()
        idfs[term] = log((num_docs - term_docs + 0.5) / (term_docs + 0.5))

    # Get the average document length.
    avg_doc_length = sum(i.length for i in DocumentIndex.objects.only('length')) / float(num_docs)

    k = 2.0
    b = 0.75
    f = []
    results = []

    def compute(idfs, idx, k, b):
        # Okapi BM25: idf * tf * (k + 1) / (tf + k * (1 - b + b * dl / avgdl))
        score = 0.0
        relDocSize = idx.length / avg_doc_length

        for term, idf in idfs.iteritems():
            tf = idx.terms[term]
            dividend = tf * (k + 1.0)
            divisor = tf + (1.0 - b + b * relDocSize) * k
            score += (dividend / divisor) * idf

        return (score, idx.doc_id)

    with futures.ThreadPoolExecutor(max_workers=5) as executor:
        for idx in DocumentIndex.objects(**query):
            f.append(executor.submit(compute, idfs, idx, k, b))

        for result in futures.as_completed(f):
            results.append(result.result())

    def iterresults():
        for score, id_ in results:
            record = Asset.objects(id=id_, **aquery).only('title', 'description', 'path', 'acl').first()

            # The attribute filters may exclude an otherwise matching document.
            if record:
                yield score, record

    return sorted(iterresults(), key=lambda pair: pair[0], reverse=True)
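# Example of driving a search, assuming `search` is whatever object exposes
# results() and the query content is hypothetical. 'tag:' and 'kind:' prefixes
# become attribute filters, terms the tokenizer marks as excluded must be
# absent from matching documents, and the remainder is BM25-ranked:
#
#     for score, asset in search.results(u'tag:news kind:page budget'):
#         print '%0.3f  %s' % (score, asset.path)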