示例#1
0
class WhooshManager(models.Manager):
    """Django manager that mirrors selected model fields into a Whoosh index.

    On top of the arguments accepted by ``models.Manager`` the constructor
    understands three keyword options:

    * ``default``   -- passed as the first argument to ``QueryParser`` when
      ``query()`` builds its parser (``None`` when omitted).
    * ``fields``    -- list of model field names to index; ``'id'`` is always
      appended.
    * ``real_time`` -- when true (the default) the ``post_save`` and
      ``post_delete`` signals of the model are hooked up.
    """

    def __init__(self, *args, **kwargs):
        """Pop the search options, open the storage and any existing index."""
        # Strip the manager-specific options before delegating to Django.
        self.default = kwargs.pop("default", None)
        self.parser = None  # built lazily by query()
        requested = kwargs.pop('fields', [])
        self.fields = requested + ['id']
        self.real_time = kwargs.pop('real_time', True)

        storage_missing = not os.path.lexists(STORAGE_DIR)
        if storage_missing:
            os.makedirs(STORAGE_DIR)
        self.storage = store.FileStorage(STORAGE_DIR)

        try:
            existing = Index(self.storage)
        except (IndexError, EmptyIndexError):
            # Nothing usable in the storage yet; class_prepared_callback
            # creates the index once the schema is known.
            existing = None
        self.index = existing

        super(WhooshManager, self).__init__(*args, **kwargs)

    def contribute_to_class(self, model, name):
        """Attach to ``model`` and ask to be told when the class is prepared."""
        super(WhooshManager, self).contribute_to_class(model, name)
        class_prepared.connect(self.class_prepared_callback, sender=self.model)

    def class_prepared_callback(self, sender, **kwargs):
        """Build the schema, make sure an index exists and hook the signals."""
        # Lazy generator: each model field is resolved and mapped in turn.
        columns = (
            self.model._meta.get_field_by_name(name)[0] for name in self.fields
        )
        self.schema = Schema(
            **dict((col.name, field_mapping[col.__class__]) for col in columns)
        )

        if self.index is None:
            self.index = Index(self.storage, schema=self.schema, create=True)
        self.searcher = self.index.searcher()

        if not self.real_time:
            return
        post_save.connect(self.post_save_callback, sender=self.model)
        post_delete.connect(self.post_delete_callback, sender=self.model)

    def post_save_callback(self, sender, instance, created, **kwargs):
        """Write the saved instance to the index and re-open the searcher."""
        values = {}
        for name in self.fields:
            values[name] = unicode(getattr(instance, name))

        self.index = self.index.refresh()
        writer = self.index.writer()
        # New rows are added, existing rows replace their indexed document.
        store_document = writer.add_document if created else writer.update_document
        store_document(**values)
        writer.commit()

        # Pick up the freshly committed state for subsequent queries.
        self.index = self.index.refresh()
        self.searcher = self.index.searcher()

    def post_delete_callback(self, sender, instance, **kwargs):
        """Do nothing: deletions are not propagated to the index."""
        pass

    def query(self, q):
        """Search the index for ``q`` plus a trailing ``*`` and return a queryset.

        The parser is created on first use. The result is ``self.filter``
        restricted to the ``id`` values of the search hits.
        """
        if self.parser is None:
            self.parser = QueryParser(self.default, schema=self.schema)
        hits = self.searcher.search(self.parser.parse(q + "*"))
        matching_ids = [hit['id'] for hit in hits]
        return self.filter(id__in=matching_ids)
示例#2
0
    def __query(self, index: Index, text: str,
                domains: Set[str]) -> pd.DataFrame:
        """Search ``index`` for ``text`` and rank the hits by name similarity.

        At most six hits are requested. Each hit's ``score`` is the
        Jaro-Winkler similarity between its lowercased name and the
        lowercased ``text``, raised to the power 1.5. Hits whose
        comma-separated ``domains`` field shares no value with ``domains``
        have that score divided by ``self.matching_domains_boost``; matching
        hits keep it unchanged.

        Returns:
            A DataFrame sorted by ``score`` (descending) with a fresh integer
            index and the columns ``raw_score``, ``name``, ``domains_boost``
            and ``score``; a bare empty DataFrame when nothing matched.
        """
        parsed = self.name_parser.parse(text)

        records = []
        with index.searcher() as searcher:
            for hit in searcher.search(parsed, limit=6):
                record = {
                    'raw_score': hit.score,
                    'id': hit['id'],
                    'name': hit['name'],
                }
                hit_domains = set((hit.get('domains') or '').split(','))
                if hit_domains & domains:
                    record['domains_boost'] = self.matching_domains_boost
                else:
                    record['domains_boost'] = 1
                records.append(record)

        if not records:
            return pd.DataFrame()

        frame = pd.DataFrame.from_records(records, index='id')

        needle = text.lower()

        def sharpened_similarity(name):
            # Exponent 1.5 "sharpens" the similarity to make it more intuitive.
            return jellyfish.jaro_winkler_similarity(name.lower(), needle)**1.5

        # Accurate score based on (lowercased) string similarity.
        frame['score'] = frame['name'].apply(sharpened_similarity)

        # Normalise so a matching-domain hit keeps its similarity as-is.
        frame['score'] = (frame['score'] * frame['domains_boost'] /
                          self.matching_domains_boost)

        ranked = frame.sort_values(by='score', ascending=False)
        # NOTE(review): 'id' is the index here, so drop=True discards it from
        # the returned frame - confirm callers do not need it.
        return ranked.reset_index(drop=True)
示例#3
0
class WhooshManager(models.Manager):
    """Model manager that indexes selected fields with Whoosh for searching.

    Extra keyword arguments accepted by the constructor (everything else is
    forwarded to ``models.Manager``):

    * ``default``   -- passed as the first argument to ``QueryParser`` the
      first time ``query()`` runs; ``None`` when omitted.
    * ``fields``    -- list of model field names to index; ``'id'`` is
      appended automatically.
    * ``real_time`` -- if true (the default), the model's ``post_save`` and
      ``post_delete`` signals are connected once the class is prepared.
    """

    def __init__(self, *args, **kwargs):
        """Consume the search options and open the on-disk index storage."""
        self.default = kwargs.pop("default", None)
        self.parser = None  # created lazily in query()
        self.fields = kwargs.pop('fields', []) + ['id']
        self.real_time = kwargs.pop('real_time', True)
        if not os.path.lexists(STORAGE_DIR):
            os.makedirs(STORAGE_DIR)
        self.storage = store.FileStorage(STORAGE_DIR)
        try:
            self.index = Index(self.storage)
        except (IndexError, EmptyIndexError):
            # No usable index in the storage yet; it is created in
            # class_prepared_callback once the schema is known.
            self.index = None
        super(WhooshManager, self).__init__(*args, **kwargs)

    def contribute_to_class(self, model, name):
        """Attach to ``model`` and subscribe to its ``class_prepared`` signal."""
        super(WhooshManager, self).contribute_to_class(model, name)
        class_prepared.connect(self.class_prepared_callback, sender=self.model)

    def class_prepared_callback(self, sender, **kwargs):
        """Build the schema, ensure an index exists and connect the signals."""
        schema_dict = {}
        for field_name in self.fields:
            field = self.model._meta.get_field_by_name(field_name)[0]
            # field_mapping translates a Django field class to a schema type.
            schema_dict[field.name] = field_mapping[field.__class__]
        self.schema = Schema(**schema_dict)
        if self.index is None:
            self.index = Index(self.storage, schema=self.schema, create=True)
        self.searcher = self.index.searcher()
        if self.real_time:
            post_save.connect(self.post_save_callback, sender=self.model)
            post_delete.connect(self.post_delete_callback, sender=self.model)

    def post_save_callback(self, sender, instance, created, **kwargs):
        """Index the saved ``instance`` and make it visible to ``query()``.

        Every indexed field is stored as ``unicode(value)``. New rows are
        added; existing rows go through ``update_document``.
        """
        dct = dict((f, unicode(getattr(instance, f))) for f in self.fields)
        self.index = self.index.refresh()
        writer = self.index.writer()
        if created:
            writer.add_document(**dct)
        else:
            writer.update_document(**dct)
        writer.commit()
        # Re-open the index and the searcher after the commit. Without this,
        # query() keeps using the searcher created in class_prepared_callback
        # and never sees documents saved afterwards.
        self.index = self.index.refresh()
        self.searcher = self.index.searcher()

    def post_delete_callback(self, sender, instance, **kwargs):
        """Do nothing: deletions are not propagated to the index.

        query() filters through the ORM by id, so ids that remain in the
        index for deleted rows match no database row.
        """
        pass

    def query(self, q):
        """Search the index for ``q`` and return the matching rows.

        The ``QueryParser`` is created on first use. The result is
        ``self.filter`` restricted to the ``id`` values of the search hits.
        """
        if self.parser is None:
            self.parser = QueryParser(self.default, schema=self.schema)
        results = self.searcher.search(self.parser.parse(q))
        return self.filter(id__in=[r['id'] for r in results])