示例#1
0
    def __init__(self, env, lang, options, scoring):
        """Set up empty index tables and select the search language.

        ``lang`` is looked up in ``languages``.  A missing entry falls back
        to ``SearchEnglish``; a string entry is treated as a dotted
        ``module.ClassName`` path and imported; anything else is used as
        the class itself.  The class is instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its UTF-8
        decoded contents are kept as ``js_scorer_code``; otherwise that
        attribute is an empty string.
        """
        self.env = env
        self._titles = {}          # filename -> title
        self._mapping = {}         # stemmed word -> set(filenames)
        self._title_mapping = {}   # stemmed words in titles -> set(filenames)
        self._stem_cache = {}      # word -> stemmed word
        self._objtypes = {}        # objtype -> index
        self._objnames = {}        # objtype index -> (domain, type, objname (localized))

        # work out which SearchLanguage class to instantiate
        search_cls = languages.get(lang)
        if search_cls is None:
            search_cls = SearchEnglish
        elif isinstance(search_cls, str):
            modname, clsname = search_cls.rsplit('.', 1)
            search_cls = getattr(__import__(modname, None, None, [clsname]),
                                 clsname)
        # otherwise it's directly a class (e.g. added by
        # app.add_search_language)
        self.lang = search_cls(options)

        scorer_code = u''
        if scoring:
            with open(scoring, 'rb') as scorer_file:
                scorer_code = scorer_file.read().decode('utf-8')
        self.js_scorer_code = scorer_code
示例#2
0
    def __init__(self, env, lang, options, scoring):
        # type: (BuildEnvironment, unicode, Dict, unicode) -> None
        """Set up empty index tables and select the search language.

        ``lang`` is looked up in ``languages``.  A missing entry falls back
        to ``SearchEnglish``; a string entry is treated as a dotted
        ``module.ClassName`` path and imported; anything else is used as
        the class itself.  The class is instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its UTF-8
        decoded contents are kept as ``js_scorer_code``; otherwise that
        attribute is an empty string.
        """
        self.env = env
        # docname -> title
        self._titles = {}  # type: Dict[unicode, unicode]
        # docname -> filename
        self._filenames = {}  # type: Dict[unicode, unicode]
        # stemmed word -> set(docname)
        self._mapping = {}  # type: Dict[unicode, Set[unicode]]
        # stemmed words in titles -> set(docname)
        self._title_mapping = {}  # type: Dict[unicode, Set[unicode]]
        # word -> stemmed word
        self._stem_cache = {}  # type: Dict[unicode, unicode]
        # objtype -> index
        self._objtypes = {}  # type: Dict[Tuple[unicode, unicode], int]
        # objtype index -> (domain, type, objname (localized))
        self._objnames = {}  # type: Dict[int, Tuple[unicode, unicode, unicode]]

        # work out which SearchLanguage class to instantiate
        search_cls = languages.get(lang)  # type: Type[SearchLanguage]
        if search_cls is None:
            search_cls = SearchEnglish
        elif isinstance(search_cls, str):
            modname, clsname = search_cls.rsplit('.', 1)
            search_cls = getattr(__import__(modname, None, None, [clsname]),
                                 clsname)
        # otherwise it's directly a class (e.g. added by
        # app.add_search_language)
        self.lang = search_cls(options)  # type: SearchLanguage

        scorer_code = u''
        if scoring:
            with open(scoring, 'rb') as scorer_file:
                scorer_code = scorer_file.read().decode('utf-8')
        self.js_scorer_code = scorer_code
示例#3
0
    def __init__(self, env: BuildEnvironment, lang: str, options: Dict,
                 scoring: str) -> None:
        """Set up empty index tables and select the search language.

        ``lang`` is looked up in ``languages``; when it is missing and
        contains an underscore, the part before the first underscore is
        tried as well.  A missing entry falls back to ``SearchEnglish``; a
        string entry is treated as a dotted ``module.ClassName`` path and
        imported; anything else is used as the class itself.  The class is
        instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its decoded
        contents are kept as ``js_scorer_code``; otherwise that attribute
        is an empty string.
        """
        self.env = env
        # docname -> title
        self._titles: Dict[str, str] = {}
        # docname -> filename
        self._filenames: Dict[str, str] = {}
        # stemmed word -> set(docname)
        self._mapping: Dict[str, Set[str]] = {}
        # stemmed words in titles -> set(docname)
        self._title_mapping: Dict[str, Set[str]] = {}
        # word -> stemmed word
        self._stem_cache: Dict[str, str] = {}
        # objtype -> index
        self._objtypes: Dict[Tuple[str, str], int] = {}
        # objtype index -> (domain, type, objname (localized))
        self._objnames: Dict[int, Tuple[str, str, str]] = {}

        # work out which SearchLanguage class to instantiate
        search_cls: Type[SearchLanguage] = languages.get(lang)
        if search_cls is None and '_' in lang:
            # fallback; try again with the bare language code
            search_cls = languages.get(lang.partition('_')[0])

        if search_cls is None:
            search_cls = SearchEnglish
        elif isinstance(search_cls, str):
            modname, clsname = search_cls.rsplit('.', 1)
            search_cls = getattr(import_module(modname), clsname)
        # otherwise it's directly a class (e.g. added by
        # app.add_search_language)
        self.lang: SearchLanguage = search_cls(options)

        scorer_code = ''
        if scoring:
            with open(scoring, 'rb') as scorer_file:
                scorer_code = scorer_file.read().decode()
        self.js_scorer_code = scorer_code
        self.js_splitter_code = splitter_code
示例#4
0
文件: __init__.py 项目: nvmanh/plant
    def __init__(self, env, lang, options, scoring):
        # type: (BuildEnvironment, unicode, Dict, unicode) -> None
        """Set up empty index tables and select the search language.

        ``lang`` is looked up in ``languages``; when it is missing and
        contains an underscore, the part before the first underscore is
        tried as well.  A missing entry falls back to ``SearchEnglish``; a
        string entry is treated as a dotted ``module.ClassName`` path and
        imported; anything else is used as the class itself.  The class is
        instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its UTF-8
        decoded contents are kept as ``js_scorer_code``; otherwise that
        attribute is an empty string.
        """
        self.env = env
        # docname -> title
        self._titles = {}  # type: Dict[unicode, unicode]
        # docname -> filename
        self._filenames = {}  # type: Dict[unicode, unicode]
        # stemmed word -> set(docname)
        self._mapping = {}  # type: Dict[unicode, Set[unicode]]
        # stemmed words in titles -> set(docname)
        self._title_mapping = {}  # type: Dict[unicode, Set[unicode]]
        # word -> stemmed word
        self._stem_cache = {}  # type: Dict[unicode, unicode]
        # objtype -> index
        self._objtypes = {}  # type: Dict[Tuple[unicode, unicode], int]
        # objtype index -> (domain, type, objname (localized))
        self._objnames = {}  # type: Dict[int, Tuple[unicode, unicode, unicode]]

        # work out which SearchLanguage class to instantiate
        search_cls = languages.get(lang)  # type: Type[SearchLanguage]
        if search_cls is None and '_' in lang:
            # fallback; try again with the bare language code
            search_cls = languages.get(lang.partition('_')[0])

        if search_cls is None:
            search_cls = SearchEnglish
        elif isinstance(search_cls, str):
            modname, clsname = search_cls.rsplit('.', 1)
            search_cls = getattr(__import__(modname, None, None, [clsname]),
                                 clsname)
        # otherwise it's directly a class (e.g. added by
        # app.add_search_language)
        self.lang = search_cls(options)  # type: SearchLanguage

        scorer_code = u''
        if scoring:
            with open(scoring, 'rb') as scorer_file:
                scorer_code = scorer_file.read().decode('utf-8')
        self.js_scorer_code = scorer_code
        self.js_splitter_code = splitter_code
示例#5
0
文件: __init__.py 项目: hagenw/sphinx
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # serializer name -> object providing ``load(stream)`` / ``dump(obj, stream)``
    formats = {
        'jsdump':   jsdump,
        'pickle':   pickle
    }   # type: Dict[unicode, Any]

    def __init__(self, env, lang, options, scoring):
        # type: (BuildEnvironment, unicode, Dict, unicode) -> None
        """Create an empty index and select the search language.

        ``lang`` is looked up in ``languages``.  A missing entry falls back
        to ``SearchEnglish``; a string entry is imported as a dotted
        ``module.ClassName`` path; anything else is used as the class
        itself.  The class is instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its UTF-8
        decoded contents become ``js_scorer_code``.
        """
        self.env = env
        self._titles = {}           # type: Dict[unicode, unicode]
                                    # docname -> title
        self._filenames = {}        # type: Dict[unicode, unicode]
                                    # docname -> filename
        self._mapping = {}          # type: Dict[unicode, Set[unicode]]
                                    # stemmed word -> set(docname)
        self._title_mapping = {}    # type: Dict[unicode, Set[unicode]]
                                    # stemmed words in titles -> set(docname)
        self._stem_cache = {}       # type: Dict[unicode, unicode]
                                    # word -> stemmed word
        self._objtypes = {}         # type: Dict[Tuple[unicode, unicode], int]
                                    # objtype -> index
        self._objnames = {}         # type: Dict[int, Tuple[unicode, unicode, unicode]]
                                    # objtype index -> (domain, type, objname (localized))
        lang_class = languages.get(lang)    # type: Type[SearchLanguage]
                                            # add language-specific SearchLanguage instance
        if lang_class is None:
            self.lang = SearchEnglish(options)  # type: SearchLanguage
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(__import__(module, None, None, [classname]),
                                 classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode('utf-8')
        else:
            self.js_scorer_code = u''
        self.js_splitter_code = splitter_code

    def load(self, stream, format):
        # type: (IO, Any) -> None
        """Reconstruct from frozen data.

        ``format`` is either a key of ``formats`` or an object offering
        ``load(stream)``.  Raises ``ValueError('old format')`` when the
        loaded data is not a dict or its ``envversion`` differs from
        ``self.env.version``.
        """
        # NOTE: with the 'pickle' format this unpickles ``stream``; only
        # load index files that come from a trusted source.
        if isinstance(format, string_types):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['docnames']
        self._filenames = dict(zip(index2fn, frozen['filenames']))
        self._titles = dict(zip(index2fn, frozen['titles']))

        def load_terms(mapping):
            # type: (Dict[unicode, Any]) -> Dict[unicode, Set[unicode]]
            # frozen terms hold either one document index or a list of them
            rv = {}
            for k, v in iteritems(mapping):
                if isinstance(v, int):
                    rv[k] = set([index2fn[v]])
                else:
                    rv[k] = set(index2fn[i] for i in v)
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        # type: (IO, Any) -> None
        """Dump the frozen index to a stream.

        ``format`` is either a key of ``formats`` or an object offering
        ``dump(obj, stream)``.
        """
        if isinstance(format, string_types):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        # type: (Dict[unicode, int]) -> Dict[unicode, Dict[unicode, Tuple[int, int, int, unicode]]]  # NOQA
        """Collect the domain objects as ``prefix -> name -> entry``.

        Each entry is ``(document index, type index, priority, short
        anchor)``.  Objects whose document is not in ``fn2index`` or whose
        priority is negative are left out.  Object types seen for the first
        time are registered in ``_objtypes`` and ``_objnames``.
        """
        rv = {}  # type: Dict[unicode, Dict[unicode, Tuple[int, int, int, unicode]]]
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in sorted(iteritems(self.env.domains)):
            for fullname, dispname, type, docname, anchor, prio in \
                    sorted(domain.get_objects()):
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                fullname = htmlescape(fullname)
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    # first object of this type: allocate the next index
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             text_type(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                # anchors that can be rebuilt from the name are abbreviated
                if anchor == fullname:
                    shortanchor = ''  # type: unicode
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index):
        # type: (Dict) -> Tuple[Dict[unicode, List[unicode]], Dict[unicode, List[unicode]]]
        """Return ``(terms, title_terms)`` expressed with document indices.

        A word found in exactly one document maps to that document's index
        (and is omitted when the document is not in ``fn2index``); any
        other word maps to the sorted list of indices of its documents
        that are in ``fn2index``.
        """
        rvs = {}, {}  # type: Tuple[Dict[unicode, List[unicode]], Dict[unicode, List[unicode]]]
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in iteritems(mapping):
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]
                else:
                    rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
        return rvs

    def freeze(self):
        # type: () -> Dict[unicode, Any]
        """Create a usable data structure for serializing.

        An index without any document yields empty ``docnames`` and
        ``titles`` instead of failing.
        """
        if self._titles:
            docnames, titles = zip(*sorted(self._titles.items()))
        else:
            # zip(*[]) produces nothing to unpack
            docnames, titles = (), ()
        filenames = [self._filenames.get(docname) for docname in docnames]
        fn2index = dict((f, i) for (i, f) in enumerate(docnames))
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in iteritems(self._objtypes))
        objnames = self._objnames
        return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames,
                    titleterms=title_terms, envversion=self.env.version)

    def label(self):
        # type: () -> unicode
        """Return the language name and code of the active search language."""
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, docnames):
        # type: (Iterable[unicode]) -> None
        """Remove data for all docnames not in the list."""
        # Materialize once: ``docnames`` is consulted several times below and
        # a one-shot iterable would be exhausted after the first pass,
        # wiping every word set.
        keep = set(docnames)
        new_titles = {}
        new_filenames = {}
        for docname, title in iteritems(self._titles):
            if docname in keep:
                new_titles[docname] = title
                new_filenames[docname] = self._filenames[docname]
        self._titles = new_titles
        self._filenames = new_filenames
        for wordnames in itervalues(self._mapping):
            wordnames.intersection_update(keep)
        for wordnames in itervalues(self._title_mapping):
            wordnames.intersection_update(keep)

    def feed(self, docname, filename, title, doctree):
        # type: (unicode, unicode, unicode, nodes.Node) -> None
        """Feed a doctree to the index.

        Records the title and filename of ``docname`` and adds the words
        collected from ``doctree`` to the title and body mappings.
        """
        self._titles[docname] = title
        self._filenames[docname] = filename

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word):
            # type: (unicode) -> unicode
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word).lower()
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            stemmed_word = stem(word)
            if _filter(stemmed_word):
                self._title_mapping.setdefault(stemmed_word, set()).add(docname)
            elif _filter(word):  # stemmer must not remove words from search index
                self._title_mapping.setdefault(word, set()).add(docname)

        for word in visitor.found_words:
            stemmed_word = stem(word)
            # again, stemmer must not remove words from search index
            if not _filter(stemmed_word) and _filter(word):
                stemmed_word = word
            # a word already recorded for this document's title is not
            # repeated in the body mapping
            already_indexed = docname in self._title_mapping.get(stemmed_word, set())
            if _filter(stemmed_word) and not already_indexed:
                self._mapping.setdefault(stemmed_word, set()).add(docname)

    def context_for_searchtool(self):
        # type: () -> Dict[unicode, Any]
        """Return the template variables holding the JavaScript snippets."""
        return dict(
            search_language_stemming_code=self.lang.js_stemmer_code,
            search_language_stop_words=jsdump.dumps(sorted(self.lang.stopwords)),
            search_scorer_tool=self.js_scorer_code,
            search_word_splitter_code=self.js_splitter_code,
        )

    def get_js_stemmer_rawcode(self):
        # type: () -> unicode
        """Return the path of the non-minified JS stemmer, or None.

        None is returned when the language defines no
        ``js_stemmer_rawcode``.
        """
        if self.lang.js_stemmer_rawcode:
            return path.join(
                sphinx.package_dir, 'search',
                'non-minified-js',
                self.lang.js_stemmer_rawcode
            )
        else:
            return None
示例#6
0
class IndexBuilder:
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # serializer name -> object providing ``load(stream)`` / ``dump(obj, stream)``
    formats = {
        'jsdump':   jsdump,
        'pickle':   pickle
    }

    def __init__(self, env: BuildEnvironment, lang: str, options: Dict, scoring: str) -> None:
        """Create an empty index and select the search language.

        ``lang`` is looked up in ``languages``; when it is missing and
        contains an underscore, the part before the first underscore is
        tried as well.  A missing entry falls back to ``SearchEnglish``; a
        string entry is imported as a dotted ``module.ClassName`` path;
        anything else is used as the class itself.  The class is
        instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its decoded
        contents become ``js_scorer_code``.
        """
        self.env = env
        self._titles = {}           # type: Dict[str, str]
                                    # docname -> title
        self._filenames = {}        # type: Dict[str, str]
                                    # docname -> filename
        self._mapping = {}          # type: Dict[str, Set[str]]
                                    # stemmed word -> set(docname)
        self._title_mapping = {}    # type: Dict[str, Set[str]]
                                    # stemmed words in titles -> set(docname)
        self._stem_cache = {}       # type: Dict[str, str]
                                    # word -> stemmed word
        self._objtypes = {}         # type: Dict[Tuple[str, str], int]
                                    # objtype -> index
        self._objnames = {}         # type: Dict[int, Tuple[str, str, str]]
                                    # objtype index -> (domain, type, objname (localized))
        lang_class = languages.get(lang)    # type: Type[SearchLanguage]
                                            # add language-specific SearchLanguage instance

        # fallback; try again with language-code
        if lang_class is None and '_' in lang:
            lang_class = languages.get(lang.split('_')[0])

        if lang_class is None:
            self.lang = SearchEnglish(options)  # type: SearchLanguage
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(import_module(module), classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode()
        else:
            self.js_scorer_code = ''
        self.js_splitter_code = splitter_code

    def load(self, stream: IO, format: Any) -> None:
        """Reconstruct from frozen data.

        ``format`` is either a key of ``formats`` or an object offering
        ``load(stream)``.  Raises ``ValueError('old format')`` when the
        loaded data is not a dict or its ``envversion`` differs from
        ``self.env.version``.
        """
        # NOTE: with the 'pickle' format this unpickles ``stream``; only
        # load index files that come from a trusted source.
        if isinstance(format, str):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['docnames']
        self._filenames = dict(zip(index2fn, frozen['filenames']))  # type: ignore
        self._titles = dict(zip(index2fn, frozen['titles']))  # type: ignore

        def load_terms(mapping: Dict[str, Any]) -> Dict[str, Set[str]]:
            # frozen terms hold either one document index or a list of them
            rv = {}
            for k, v in mapping.items():
                if isinstance(v, int):
                    rv[k] = {index2fn[v]}
                else:
                    rv[k] = {index2fn[i] for i in v}
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream: IO, format: Any) -> None:
        """Dump the frozen index to a stream.

        ``format`` is either a key of ``formats`` or an object offering
        ``dump(obj, stream)``.
        """
        if isinstance(format, str):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index: Dict[str, int]
                    ) -> Dict[str, Dict[str, Tuple[int, int, int, str]]]:
        """Collect the domain objects as ``prefix -> name -> entry``.

        Prefix and name come from the HTML-escaped display name.  Each
        entry is ``(document index, type index, priority, short anchor)``.
        Objects whose document is not in ``fn2index`` or whose priority is
        negative are left out.  Object types seen for the first time are
        registered in ``_objtypes`` and ``_objnames``.
        """
        rv = {}  # type: Dict[str, Dict[str, Tuple[int, int, int, str]]]
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in sorted(self.env.domains.items()):
            for fullname, dispname, type, docname, anchor, prio in \
                    sorted(domain.get_objects()):
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                fullname = html.escape(fullname)
                dispname = html.escape(dispname)
                prefix, _, name = dispname.rpartition('.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    # first object of this type: allocate the next index
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use str() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             str(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                # anchors that can be rebuilt from the name are abbreviated
                if anchor == fullname:
                    shortanchor = ''
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index: Dict) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
        """Return ``(terms, title_terms)`` expressed with document indices.

        A word found in exactly one document maps to that document's index
        (and is omitted when the document is not in ``fn2index``); any
        other word maps to the sorted list of indices of its documents
        that are in ``fn2index``.
        """
        rvs = {}, {}  # type: Tuple[Dict[str, List[str]], Dict[str, List[str]]]
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in mapping.items():  # type: ignore
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]  # type: ignore
                else:
                    rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])  # type: ignore  # NOQA
        return rvs

    def freeze(self) -> Dict[str, Any]:
        """Create a usable data structure for serializing.

        An index without any document yields empty ``docnames`` and
        ``titles`` instead of failing.
        """
        if self._titles:
            docnames, titles = zip(*sorted(self._titles.items()))
        else:
            # zip(*[]) produces nothing to unpack
            docnames, titles = (), ()
        filenames = [self._filenames.get(docname) for docname in docnames]
        fn2index = {f: i for (i, f) in enumerate(docnames)}
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
        objnames = self._objnames
        return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames,
                    titleterms=title_terms, envversion=self.env.version)

    def label(self) -> str:
        """Return the language name and code of the active search language."""
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, docnames: Iterable[str]) -> None:
        """Remove data for all docnames not in the list."""
        # Materialize once: ``docnames`` is consulted several times below and
        # a one-shot iterable would be exhausted after the first pass,
        # wiping every word set.
        keep = set(docnames)
        new_titles = {}
        new_filenames = {}
        for docname, title in self._titles.items():
            if docname in keep:
                new_titles[docname] = title
                new_filenames[docname] = self._filenames[docname]
        self._titles = new_titles
        self._filenames = new_filenames
        for wordnames in self._mapping.values():
            wordnames.intersection_update(keep)
        for wordnames in self._title_mapping.values():
            wordnames.intersection_update(keep)

    def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
        """Feed a doctree to the index.

        Records the title and filename of ``docname`` and adds the words
        collected from ``doctree`` to the title and body mappings.
        """
        self._titles[docname] = title
        self._filenames[docname] = filename

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word: str) -> str:
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word).lower()
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            stemmed_word = stem(word)
            if _filter(stemmed_word):
                self._title_mapping.setdefault(stemmed_word, set()).add(docname)
            elif _filter(word):  # stemmer must not remove words from search index
                self._title_mapping.setdefault(word, set()).add(docname)

        for word in visitor.found_words:
            stemmed_word = stem(word)
            # again, stemmer must not remove words from search index
            if not _filter(stemmed_word) and _filter(word):
                stemmed_word = word
            # a word already recorded for this document's title is not
            # repeated in the body mapping
            already_indexed = docname in self._title_mapping.get(stemmed_word, set())
            if _filter(stemmed_word) and not already_indexed:
                self._mapping.setdefault(stemmed_word, set()).add(docname)

    def context_for_searchtool(self) -> Dict[str, Any]:
        """Return the template variables holding the JavaScript snippets.

        The splitter code of the search language takes precedence over the
        builder's default splitter when the language provides one.
        """
        if self.lang.js_splitter_code:
            js_splitter_code = self.lang.js_splitter_code
        else:
            js_splitter_code = self.js_splitter_code

        return {
            'search_language_stemming_code': self.lang.js_stemmer_code,
            'search_language_stop_words': jsdump.dumps(sorted(self.lang.stopwords)),
            'search_scorer_tool': self.js_scorer_code,
            'search_word_splitter_code': js_splitter_code,
        }

    def get_js_stemmer_rawcode(self) -> str:
        """Return the path of the non-minified JS stemmer, or None.

        None is returned when the language defines no
        ``js_stemmer_rawcode``.
        """
        if self.lang.js_stemmer_rawcode:
            return path.join(package_dir, 'search', 'non-minified-js',
                             self.lang.js_stemmer_rawcode)
        else:
            return None
示例#7
0
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # serializer name -> object providing ``load(stream)`` / ``dump(obj, stream)``
    formats = {
        'jsdump':   jsdump,
        'pickle':   pickle
    }

    def __init__(self, env, lang, options, scoring):
        """Create an empty index and select the search language.

        ``lang`` is looked up in ``languages``.  A missing entry falls back
        to ``SearchEnglish``; a string entry is imported as a dotted
        ``module.ClassName`` path; anything else is used as the class
        itself.  The class is instantiated with ``options``.

        If ``scoring`` is truthy it is opened as a file and its UTF-8
        decoded contents become ``js_scorer_code``.
        """
        self.env = env
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # stemmed words in titles -> set(filenames)
        self._title_mapping = {}
        # word -> stemmed word
        self._stem_cache = {}
        # objtype -> index
        self._objtypes = {}
        # objtype index -> (domain, type, objname (localized))
        self._objnames = {}
        # add language-specific SearchLanguage instance
        lang_class = languages.get(lang)
        if lang_class is None:
            self.lang = SearchEnglish(options)
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(__import__(module, None, None, [classname]),
                                 classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode('utf-8')
        else:
            self.js_scorer_code = u''

    def load(self, stream, format):
        """Reconstruct from frozen data.

        ``format`` is either a key of ``formats`` or an object offering
        ``load(stream)``.  Raises ``ValueError('old format')`` when the
        loaded data is not a dict or its ``envversion`` differs from
        ``self.env.version``.
        """
        # NOTE: with the 'pickle' format this unpickles ``stream``; only
        # load index files that come from a trusted source.
        if isinstance(format, string_types):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))

        def load_terms(mapping):
            # frozen terms hold either one file index or a list of them
            rv = {}
            for k, v in iteritems(mapping):
                if isinstance(v, int):
                    rv[k] = set([index2fn[v]])
                else:
                    rv[k] = set(index2fn[i] for i in v)
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream.

        ``format`` is either a key of ``formats`` or an object offering
        ``dump(obj, stream)``.
        """
        if isinstance(format, string_types):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        """Collect the domain objects as ``prefix -> name -> entry``.

        Each entry is ``(file index, type index, priority, short anchor)``.
        Objects whose document is not in ``fn2index`` or whose priority is
        negative are left out.  Object types seen for the first time are
        registered in ``_objtypes`` and ``_objnames``.
        """
        rv = {}
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in sorted(iteritems(self.env.domains)):
            for fullname, dispname, type, docname, anchor, prio in \
                    sorted(domain.get_objects()):
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                fullname = htmlescape(fullname)
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    # first object of this type: allocate the next index
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             text_type(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                # anchors that can be rebuilt from the name are abbreviated
                if anchor == fullname:
                    shortanchor = ''
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index):
        """Return ``(terms, title_terms)`` expressed with file indices.

        A word found in exactly one file maps to that file's index (and is
        omitted when the file is not in ``fn2index``); any other word maps
        to the sorted list of indices of its files that are in
        ``fn2index``.
        """
        rvs = {}, {}
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in iteritems(mapping):
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]
                else:
                    rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
        return rvs

    def freeze(self):
        """Create a usable data structure for serializing.

        An index without any document yields empty ``filenames`` and
        ``titles`` instead of failing.
        """
        if self._titles:
            filenames, titles = zip(*sorted(self._titles.items()))
        else:
            # zip(*[]) produces nothing to unpack
            filenames, titles = (), ()
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in iteritems(self._objtypes))
        objnames = self._objnames
        return dict(filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames,
                    titleterms=title_terms, envversion=self.env.version)

    def label(self):
        """Return the language name and code of the active search language."""
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        # Materialize once: ``filenames`` is consulted several times below
        # and a one-shot iterable would be exhausted after the first pass,
        # wiping every word set.
        keep = set(filenames)
        new_titles = {}
        for filename, title in iteritems(self._titles):
            if filename in keep:
                new_titles[filename] = title
        self._titles = new_titles
        for wordnames in itervalues(self._mapping):
            wordnames.intersection_update(keep)
        for wordnames in itervalues(self._title_mapping):
            wordnames.intersection_update(keep)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index.

        Records the title of ``filename`` and adds the stemmed words
        collected from ``doctree`` to the title and body mappings.
        """
        self._titles[filename] = title
        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word):
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word)
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            word = stem(word)
            if _filter(word):
                self._title_mapping.setdefault(word, set()).add(filename)

        for word in visitor.found_words:
            word = stem(word)
            # Skip the word only if *this* file already has it as a title
            # word; testing mere presence in ``_title_mapping`` would drop
            # it from the body index whenever any other file's title
            # happens to contain it.
            already_indexed = filename in self._title_mapping.get(word, ())
            if not already_indexed and _filter(word):
                self._mapping.setdefault(word, set()).add(filename)

    def context_for_searchtool(self):
        """Return the template variables holding the JavaScript snippets."""
        return dict(
            search_language_stemming_code=self.lang.js_stemmer_code,
            search_language_stop_words=jsdump.dumps(sorted(self.lang.stopwords)),
            search_scorer_tool=self.js_scorer_code,
        )

    def get_js_stemmer_rawcode(self):
        """Return the path of the non-minified JS stemmer, or None.

        The path is built relative to the directory of this module.  None
        is returned when the language defines no ``js_stemmer_rawcode``.
        """
        if self.lang.js_stemmer_rawcode:
            return path.join(
                path.dirname(path.abspath(__file__)),
                'non-minified-js',
                self.lang.js_stemmer_rawcode
            )
        return None
示例#8
0
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'jsdump':   jsdump,
        'pickle':   pickle
    }

    def __init__(self, env, lang, options, scoring):
        """Set up an empty index for the given language.

        :param env: build environment, stored as ``self.env``
        :param lang: language code looked up in ``languages``; codes that
            are not registered fall back to ``SearchEnglish``
        :param options: passed to the search language constructor
        :param scoring: path of a UTF-8 encoded file whose content becomes
            ``js_scorer_code``; a falsy value yields an empty string
        """
        self.env = env
        self._titles = {}          # filename -> title
        self._mapping = {}         # stemmed word -> set(filenames)
        self._title_mapping = {}   # stemmed words in titles -> set(filenames)
        self._stem_cache = {}      # word -> stemmed word
        self._objtypes = {}        # objtype -> index
        # objtype index -> (domain, type, objname (localized))
        self._objnames = {}

        # pick the language-specific SearchLanguage class
        search_class = languages.get(lang)
        if search_class is None:
            search_class = SearchEnglish
        elif isinstance(search_class, str):
            # registered as a dotted path: import the module, fetch the class
            module_name, class_name = search_class.rsplit('.', 1)
            module = __import__(module_name, None, None, [class_name])
            search_class = getattr(module, class_name)
        # anything else is directly a class
        # (e.g. added by app.add_search_language)
        self.lang = search_class(options)

        if not scoring:
            self.js_scorer_code = u''
        else:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode('utf-8')

    def load(self, stream, format):
        """Reconstruct from frozen data.

        :param stream: passed to the serializer's ``load()``
        :param format: a key of ``self.formats`` or an object providing
            ``load(stream)``
        :raises ValueError: if the loaded data is not a dict or its
            ``envversion`` differs from ``self.env.version``
        """
        serializer = format
        if isinstance(serializer, string_types):
            serializer = self.formats[serializer]
        frozen = serializer.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        if frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))

        def to_filenames(indices):
            # a term stored for a single file is a bare index, not a list
            if isinstance(indices, int):
                return set([index2fn[indices]])
            return set(index2fn[i] for i in indices)

        def load_terms(mapping):
            return dict((term, to_filenames(indices))
                        for term, indices in iteritems(mapping))

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream.

        :param stream: passed to the serializer's ``dump()``
        :param format: a key of ``self.formats`` or an object providing
            ``dump(data, stream)``
        """
        serializer = format
        if isinstance(serializer, string_types):
            serializer = self.formats[serializer]
        serializer.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        """Collect the objects of all domains that live in indexed files.

        Object types met for the first time are registered in
        ``self._objtypes`` and ``self._objnames`` as a side effect.

        :param fn2index: mapping filename -> index; objects whose document
            is not a key are skipped, as are objects with negative priority
        :returns: dict prefix -> {name: (file index, type index, priority,
            short anchor)}, where prefix and name come from splitting the
            full name at its last dot
        """
        objects = {}
        type_indices = self._objtypes
        type_names = self._objnames
        for domainname, domain in iteritems(self.env.domains):
            for entry in domain.get_objects():
                fullname, dispname, objtype, docname, anchor, prio = entry
                # XXX use dispname?
                if docname not in fn2index or prio < 0:
                    continue
                prefix, name = rpartition(fullname, '.')
                members = objects.setdefault(prefix, {})
                type_key = (domainname, objtype)
                if type_key in type_indices:
                    typeindex = type_indices[type_key]
                else:
                    # first occurrence of this type: hand out the next index
                    typeindex = len(type_indices)
                    type_indices[type_key] = typeindex
                    otype = domain.object_types.get(objtype)
                    if otype:
                        # use unicode() to fire translation proxies
                        localized = text_type(domain.get_type_name(otype))
                    else:
                        localized = objtype
                    type_names[typeindex] = (domainname, objtype, localized)
                # abbreviate anchors that can be rebuilt from the full name
                if anchor == fullname:
                    shortanchor = ''
                elif anchor == objtype + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                members[name] = (fn2index[docname], typeindex, prio,
                                 shortanchor)
        return objects

    def get_terms(self, fn2index):
        """Build the serializable term tables.

        :param fn2index: mapping filename -> index; files that are not a key
            are left out of the result
        :returns: 2-tuple ``(terms, title_terms)`` built from
            ``self._mapping`` and ``self._title_mapping``.  A word recorded
            for exactly one file maps to that file's index; any other word
            maps to the ascending list of its file indices.  Words without
            any indexed file are omitted.
        """
        def compact(mapping):
            table = {}
            for word, files in mapping.items():
                # Sets iterate in arbitrary order; sorting keeps the
                # serialized index identical across identical builds.
                indices = sorted(fn2index[fn] for fn in files
                                 if fn in fn2index)
                if not indices:
                    # nothing left to point at (e.g. emptied by prune())
                    continue
                if len(files) == 1:
                    table[word] = indices[0]
                else:
                    table[word] = indices
            return table

        return compact(self._mapping), compact(self._title_mapping)

    def freeze(self):
        """Create a usable data structure for serializing.

        :returns: dict with the keys ``filenames``, ``titles``, ``terms``,
            ``objects``, ``objtypes``, ``objnames``, ``titleterms`` and
            ``envversion``
        """
        titles_by_file = self._titles
        filenames = list(titles_by_file.keys())
        titles = list(titles_by_file.values())
        fn2index = {}
        for index, filename in enumerate(filenames):
            fn2index[filename] = index
        terms, title_terms = self.get_terms(fn2index)

        # get_objects() populates _objtypes, so the type table has to be
        # derived after the call
        objects = self.get_objects(fn2index)
        objtypes = {}
        for key, typeindex in iteritems(self._objtypes):
            objtypes[typeindex] = key[0] + ':' + key[1]
        return {
            'filenames': filenames,
            'titles': titles,
            'terms': terms,
            'objects': objects,
            'objtypes': objtypes,
            'objnames': self._objnames,
            'titleterms': title_terms,
            'envversion': self.env.version,
        }

    def label(self):
        """Return the search language's name followed by its code."""
        language = self.lang
        name = language.language_name
        code = language.lang
        return "%s (code: %s)" % (name, code)

    def prune(self, filenames):
        """Remove data for all filenames not in the list.

        Titles are kept only for the given filenames (in the order given)
        and every word's file set in both term mappings is intersected with
        them in place.  Words whose set becomes empty keep their (empty)
        entry.

        :param filenames: iterable of the filenames to keep; it is
            materialized once, so a one-shot iterator is accepted as well
        """
        # Snapshot the argument first: it is traversed more than once below,
        # and a generator would be exhausted after the first pass, which
        # would then empty every posting set.
        filenames = list(filenames)
        # Hash the names once instead of once per indexed word.
        keep = set(filenames)

        old_titles = self._titles
        self._titles = dict((filename, old_titles[filename])
                            for filename in filenames
                            if filename in old_titles)
        for mapping in (self._mapping, self._title_mapping):
            for wordnames in mapping.values():
                wordnames.intersection_update(keep)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index.

        Records *title* for *filename*, walks *doctree* with a
        ``WordCollector`` and registers every stemmed word accepted by the
        language's ``word_filter``: words collected from titles go to the
        title mapping, the other collected words to the body mapping.

        :param filename: name under which the document is indexed
        :param title: title stored for the document
        :param doctree: document tree; must provide ``walk(visitor)``
        """
        self._titles[filename] = title

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        stem_cache = self._stem_cache

        # memoize self.lang.stem
        def stem(word):
            try:
                return stem_cache[word]
            except KeyError:
                stemmed = stem_cache[word] = self.lang.stem(word)
                return stemmed
        _filter = self.lang.word_filter
        title_mapping = self._title_mapping
        body_mapping = self._mapping

        for word in visitor.found_title_words:
            word = stem(word)
            if _filter(word):
                title_mapping.setdefault(word, set()).add(filename)

        for word in visitor.found_words:
            word = stem(word)
            # Skip a body word only when *this* document already carries it
            # as a title term.  Testing ``word in title_mapping`` instead
            # would drop the word as soon as any other document has it in a
            # title, so this document could never be found through it.
            if filename in title_mapping.get(word, ()):
                continue
            if _filter(word):
                body_mapping.setdefault(word, set()).add(filename)

    def context_for_searchtool(self):
        """Build the context dict for the search tool.

        It holds the language's JavaScript stemmer code, the language's stop
        words (sorted and serialized with ``jsdump``) and the scorer code.
        """
        language = self.lang
        stemming_code = language.js_stemmer_code
        stop_words = jsdump.dumps(sorted(language.stopwords))
        return {
            'search_language_stemming_code': stemming_code,
            'search_language_stop_words': stop_words,
            'search_scorer_tool': self.js_scorer_code,
        }