def __init__(self, env: BuildEnvironment, lang: str, options: Dict, scoring: str) -> None:
    self.env = env
    self._titles: Dict[str, str] = {}           # docname -> title
    self._filenames: Dict[str, str] = {}        # docname -> filename
    self._mapping: Dict[str, Set[str]] = {}     # stemmed word -> set(docname)
    # stemmed words in titles -> set(docname)
    self._title_mapping: Dict[str, Set[str]] = {}
    self._stem_cache: Dict[str, str] = {}       # word -> stemmed word
    self._objtypes: Dict[Tuple[str, str], int] = {}     # objtype -> index
    # objtype index -> (domain, type, objname (localized))
    self._objnames: Dict[int, Tuple[str, str, str]] = {}

    # add language-specific SearchLanguage instance
    lang_class: Type[SearchLanguage] = languages.get(lang)

    # fallback; try again with language-code
    if lang_class is None and '_' in lang:
        lang_class = languages.get(lang.split('_')[0])

    if lang_class is None:
        self.lang: SearchLanguage = SearchEnglish(options)
    elif isinstance(lang_class, str):
        module, classname = lang_class.rsplit('.', 1)
        lang_class = getattr(import_module(module), classname)
        self.lang = lang_class(options)
    else:
        # it's directly a class (e.g. added by app.add_search_language)
        self.lang = lang_class(options)

    if scoring:
        with open(scoring, 'rb') as fp:
            self.js_scorer_code = fp.read().decode()
    else:
        self.js_scorer_code = ''
    self.js_splitter_code = splitter_code
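
# A minimal standalone sketch of the lookup-with-region-fallback pattern used
# in the __init__ above; `languages` here is a stand-in dict, not Sphinx's
# real language table, and `resolve_lang` is a hypothetical helper name.
languages = {'en': 'sphinx.search.en.SearchEnglish'}

def resolve_lang(lang):
    lang_class = languages.get(lang)
    if lang_class is None and '_' in lang:
        # 'en_GB' falls back to 'en', 'zh_CN' to 'zh', and so on
        lang_class = languages.get(lang.split('_')[0])
    return lang_class

assert resolve_lang('en_GB') == 'sphinx.search.en.SearchEnglish'
assert resolve_lang('xx_YY') is None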
def __init__(self, env, lang, options, scoring):
    # type: (BuildEnvironment, unicode, Dict, unicode) -> None
    self.env = env
    self._titles = {}           # type: Dict[unicode, unicode]  # docname -> title
    self._filenames = {}        # type: Dict[unicode, unicode]  # docname -> filename
    self._mapping = {}          # type: Dict[unicode, Set[unicode]]  # stemmed word -> set(docname)
    self._title_mapping = {}    # type: Dict[unicode, Set[unicode]]  # stemmed words in titles -> set(docname)
    self._stem_cache = {}       # type: Dict[unicode, unicode]  # word -> stemmed word
    self._objtypes = {}         # type: Dict[Tuple[unicode, unicode], int]  # objtype -> index
    self._objnames = {}         # type: Dict[int, Tuple[unicode, unicode, unicode]]  # objtype index -> (domain, type, objname (localized))

    lang_class = languages.get(lang)    # type: Type[SearchLanguage]
                                        # add language-specific SearchLanguage instance

    # fallback; try again with language-code
    if lang_class is None and '_' in lang:
        lang_class = languages.get(lang.split('_')[0])

    if lang_class is None:
        self.lang = SearchEnglish(options)  # type: SearchLanguage
    elif isinstance(lang_class, str):
        module, classname = lang_class.rsplit('.', 1)
        lang_class = getattr(__import__(module, None, None, [classname]),
                             classname)
        self.lang = lang_class(options)
    else:
        # it's directly a class (e.g. added by app.add_search_language)
        self.lang = lang_class(options)

    if scoring:
        with open(scoring, 'rb') as fp:
            self.js_scorer_code = fp.read().decode('utf-8')
    else:
        self.js_scorer_code = u''
    self.js_splitter_code = splitter_code
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }   # type: Dict[unicode, Any]

    def __init__(self, env, lang, options, scoring):
        # type: (BuildEnvironment, unicode, Dict, unicode) -> None
        self.env = env
        self._titles = {}           # type: Dict[unicode, unicode]  # docname -> title
        self._filenames = {}        # type: Dict[unicode, unicode]  # docname -> filename
        self._mapping = {}          # type: Dict[unicode, Set[unicode]]  # stemmed word -> set(docname)
        self._title_mapping = {}    # type: Dict[unicode, Set[unicode]]  # stemmed words in titles -> set(docname)
        self._stem_cache = {}       # type: Dict[unicode, unicode]  # word -> stemmed word
        self._objtypes = {}         # type: Dict[Tuple[unicode, unicode], int]  # objtype -> index
        self._objnames = {}         # type: Dict[int, Tuple[unicode, unicode, unicode]]  # objtype index -> (domain, type, objname (localized))

        lang_class = languages.get(lang)    # type: Type[SearchLanguage]
                                            # add language-specific SearchLanguage instance
        if lang_class is None:
            self.lang = SearchEnglish(options)  # type: SearchLanguage
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(__import__(module, None, None, [classname]),
                                 classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode('utf-8')
        else:
            self.js_scorer_code = u''
        self.js_splitter_code = splitter_code

    def load(self, stream, format):
        # type: (IO, Any) -> None
        """Reconstruct from frozen data."""
        if isinstance(format, string_types):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['docnames']
        self._filenames = dict(zip(index2fn, frozen['filenames']))
        self._titles = dict(zip(index2fn, frozen['titles']))

        def load_terms(mapping):
            # type: (Dict[unicode, Any]) -> Dict[unicode, Set[unicode]]
            rv = {}
            for k, v in iteritems(mapping):
                if isinstance(v, int):
                    rv[k] = set([index2fn[v]])
                else:
                    rv[k] = set(index2fn[i] for i in v)
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        # type: (IO, Any) -> None
        """Dump the frozen index to a stream."""
        if isinstance(format, string_types):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        # type: (Dict[unicode, int]) -> Dict[unicode, Dict[unicode, Tuple[int, int, int, unicode]]]  # NOQA
        rv = {}  # type: Dict[unicode, Dict[unicode, Tuple[int, int, int, unicode]]]
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in sorted(iteritems(self.env.domains)):
            for fullname, dispname, type, docname, anchor, prio in \
                    sorted(domain.get_objects()):
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                fullname = htmlescape(fullname)
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             text_type(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                if anchor == fullname:
                    shortanchor = ''    # type: unicode
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index):
        # type: (Dict) -> Tuple[Dict[unicode, List[unicode]], Dict[unicode, List[unicode]]]
        rvs = {}, {}  # type: Tuple[Dict[unicode, List[unicode]], Dict[unicode, List[unicode]]]
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in iteritems(mapping):
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]
                else:
                    rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
        return rvs

    def freeze(self):
        # type: () -> Dict[unicode, Any]
        """Create a usable data structure for serializing."""
        docnames, titles = zip(*sorted(self._titles.items()))
        filenames = [self._filenames.get(docname) for docname in docnames]
        fn2index = dict((f, i) for (i, f) in enumerate(docnames))
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in iteritems(self._objtypes))
        objnames = self._objnames
        return dict(docnames=docnames, filenames=filenames, titles=titles,
                    terms=terms, objects=objects, objtypes=objtypes,
                    objnames=objnames, titleterms=title_terms,
                    envversion=self.env.version)

    def label(self):
        # type: () -> unicode
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, docnames):
        # type: (Iterable[unicode]) -> None
        """Remove data for all docnames not in the list."""
        new_titles = {}
        new_filenames = {}
        for docname in docnames:
            if docname in self._titles:
                new_titles[docname] = self._titles[docname]
                new_filenames[docname] = self._filenames[docname]
        self._titles = new_titles
        self._filenames = new_filenames
        for wordnames in itervalues(self._mapping):
            wordnames.intersection_update(docnames)
        for wordnames in itervalues(self._title_mapping):
            wordnames.intersection_update(docnames)

    def feed(self, docname, filename, title, doctree):
        # type: (unicode, unicode, unicode, nodes.Node) -> None
        """Feed a doctree to the index."""
        self._titles[docname] = title
        self._filenames[docname] = filename

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word):
            # type: (unicode) -> unicode
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word).lower()
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            stemmed_word = stem(word)
            if _filter(stemmed_word):
                self._title_mapping.setdefault(stemmed_word, set()).add(docname)
            elif _filter(word):  # stemmer must not remove words from search index
                self._title_mapping.setdefault(word, set()).add(docname)

        for word in visitor.found_words:
            stemmed_word = stem(word)
            # again, stemmer must not remove words from search index
            if not _filter(stemmed_word) and _filter(word):
                stemmed_word = word
            already_indexed = docname in self._title_mapping.get(stemmed_word, set())
            if _filter(stemmed_word) and not already_indexed:
                self._mapping.setdefault(stemmed_word, set()).add(docname)

    def context_for_searchtool(self):
        # type: () -> Dict[unicode, Any]
        return dict(
            search_language_stemming_code = self.lang.js_stemmer_code,
            search_language_stop_words = jsdump.dumps(sorted(self.lang.stopwords)),
            search_scorer_tool = self.js_scorer_code,
            search_word_splitter_code = self.js_splitter_code,
        )

    def get_js_stemmer_rawcode(self):
        # type: () -> unicode
        if self.lang.js_stemmer_rawcode:
            return path.join(
                sphinx.package_dir, 'search',
                'non-minified-js', self.lang.js_stemmer_rawcode
            )
        else:
            return None
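
# Standalone sketch of the shortanchor compression used in get_objects()
# above: the two common anchor forms are abbreviated to '' and '-' so the
# frozen index stays small. `short_anchor` is an illustrative name, not
# part of the source.
def short_anchor(anchor, fullname, type):
    if anchor == fullname:
        return ''
    elif anchor == type + '-' + fullname:
        return '-'
    return anchor

assert short_anchor('mod.func', 'mod.func', 'function') == ''
assert short_anchor('function-mod.func', 'mod.func', 'function') == '-'
assert short_anchor('custom-anchor', 'mod.func', 'function') == 'custom-anchor'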
class IndexBuilder:
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env: BuildEnvironment, lang: str, options: Dict, scoring: str) -> None:
        self.env = env
        self._titles = {}           # type: Dict[str, str]  # docname -> title
        self._filenames = {}        # type: Dict[str, str]  # docname -> filename
        self._mapping = {}          # type: Dict[str, Set[str]]  # stemmed word -> set(docname)
        self._title_mapping = {}    # type: Dict[str, Set[str]]  # stemmed words in titles -> set(docname)
        self._stem_cache = {}       # type: Dict[str, str]  # word -> stemmed word
        self._objtypes = {}         # type: Dict[Tuple[str, str], int]  # objtype -> index
        self._objnames = {}         # type: Dict[int, Tuple[str, str, str]]  # objtype index -> (domain, type, objname (localized))

        lang_class = languages.get(lang)    # type: Type[SearchLanguage]
                                            # add language-specific SearchLanguage instance

        # fallback; try again with language-code
        if lang_class is None and '_' in lang:
            lang_class = languages.get(lang.split('_')[0])

        if lang_class is None:
            self.lang = SearchEnglish(options)  # type: SearchLanguage
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(import_module(module), classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode()
        else:
            self.js_scorer_code = ''
        self.js_splitter_code = splitter_code

    def load(self, stream: IO, format: Any) -> None:
        """Reconstruct from frozen data."""
        if isinstance(format, str):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['docnames']
        self._filenames = dict(zip(index2fn, frozen['filenames']))  # type: ignore
        self._titles = dict(zip(index2fn, frozen['titles']))  # type: ignore

        def load_terms(mapping: Dict[str, Any]) -> Dict[str, Set[str]]:
            rv = {}
            for k, v in mapping.items():
                if isinstance(v, int):
                    rv[k] = {index2fn[v]}
                else:
                    rv[k] = {index2fn[i] for i in v}
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream: IO, format: Any) -> None:
        """Dump the frozen index to a stream."""
        if isinstance(format, str):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index: Dict[str, int]
                    ) -> Dict[str, Dict[str, Tuple[int, int, int, str]]]:
        rv = {}  # type: Dict[str, Dict[str, Tuple[int, int, int, str]]]
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in sorted(self.env.domains.items()):
            for fullname, dispname, type, docname, anchor, prio in \
                    sorted(domain.get_objects()):
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                fullname = html.escape(fullname)
                dispname = html.escape(dispname)
                prefix, _, name = dispname.rpartition('.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use str() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             str(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                if anchor == fullname:
                    shortanchor = ''
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index: Dict) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
        rvs = {}, {}  # type: Tuple[Dict[str, List[str]], Dict[str, List[str]]]
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in mapping.items():  # type: ignore
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]  # type: ignore
                else:
                    rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])  # type: ignore  # NOQA
        return rvs

    def freeze(self) -> Dict[str, Any]:
        """Create a usable data structure for serializing."""
        docnames, titles = zip(*sorted(self._titles.items()))
        filenames = [self._filenames.get(docname) for docname in docnames]
        fn2index = {f: i for (i, f) in enumerate(docnames)}
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
        objnames = self._objnames
        return dict(docnames=docnames, filenames=filenames, titles=titles,
                    terms=terms, objects=objects, objtypes=objtypes,
                    objnames=objnames, titleterms=title_terms,
                    envversion=self.env.version)

    def label(self) -> str:
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, docnames: Iterable[str]) -> None:
        """Remove data for all docnames not in the list."""
        new_titles = {}
        new_filenames = {}
        for docname in docnames:
            if docname in self._titles:
                new_titles[docname] = self._titles[docname]
                new_filenames[docname] = self._filenames[docname]
        self._titles = new_titles
        self._filenames = new_filenames
        for wordnames in self._mapping.values():
            wordnames.intersection_update(docnames)
        for wordnames in self._title_mapping.values():
            wordnames.intersection_update(docnames)

    def feed(self, docname: str, filename: str, title: str, doctree: nodes.document) -> None:
        """Feed a doctree to the index."""
        self._titles[docname] = title
        self._filenames[docname] = filename

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word: str) -> str:
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word).lower()
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            stemmed_word = stem(word)
            if _filter(stemmed_word):
                self._title_mapping.setdefault(stemmed_word, set()).add(docname)
            elif _filter(word):  # stemmer must not remove words from search index
                self._title_mapping.setdefault(word, set()).add(docname)

        for word in visitor.found_words:
            stemmed_word = stem(word)
            # again, stemmer must not remove words from search index
            if not _filter(stemmed_word) and _filter(word):
                stemmed_word = word
            already_indexed = docname in self._title_mapping.get(stemmed_word, set())
            if _filter(stemmed_word) and not already_indexed:
                self._mapping.setdefault(stemmed_word, set()).add(docname)

    def context_for_searchtool(self) -> Dict[str, Any]:
        if self.lang.js_splitter_code:
            js_splitter_code = self.lang.js_splitter_code
        else:
            js_splitter_code = self.js_splitter_code

        return {
            'search_language_stemming_code': self.lang.js_stemmer_code,
            'search_language_stop_words': jsdump.dumps(sorted(self.lang.stopwords)),
            'search_scorer_tool': self.js_scorer_code,
            'search_word_splitter_code': js_splitter_code,
        }

    def get_js_stemmer_rawcode(self) -> str:
        if self.lang.js_stemmer_rawcode:
            return path.join(package_dir, 'search', 'non-minified-js',
                             self.lang.js_stemmer_rawcode)
        else:
            return None
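
# Round-trip sketch of the term compaction above: get_terms() stores a word
# that occurs in exactly one document as a bare int index, otherwise as a
# sorted list of indices, and load_terms() reverses it. The data and the
# encode/decode helper names below are made up for illustration.
fn2index = {'intro': 0, 'usage': 1}
index2fn = ['intro', 'usage']

def encode(mapping):
    rv = {}
    for word, docs in mapping.items():
        if len(docs) == 1:
            (fn,) = docs
            rv[word] = fn2index[fn]
        else:
            rv[word] = sorted(fn2index[fn] for fn in docs)
    return rv

def decode(mapping):
    return {word: {index2fn[v]} if isinstance(v, int) else {index2fn[i] for i in v}
            for word, v in mapping.items()}

mapping = {'build': {'intro'}, 'search': {'intro', 'usage'}}
assert encode(mapping) == {'build': 0, 'search': [0, 1]}
assert decode(encode(mapping)) == mapping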
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env, lang, options, scoring):
        self.env = env
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # stemmed words in titles -> set(filenames)
        self._title_mapping = {}
        # word -> stemmed word
        self._stem_cache = {}
        # objtype -> index
        self._objtypes = {}
        # objtype index -> (domain, type, objname (localized))
        self._objnames = {}

        # add language-specific SearchLanguage instance
        lang_class = languages.get(lang)
        if lang_class is None:
            self.lang = SearchEnglish(options)
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(__import__(module, None, None, [classname]),
                                 classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode('utf-8')
        else:
            self.js_scorer_code = u''

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, string_types):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))

        def load_terms(mapping):
            rv = {}
            for k, v in iteritems(mapping):
                if isinstance(v, int):
                    rv[k] = set([index2fn[v]])
                else:
                    rv[k] = set(index2fn[i] for i in v)
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, string_types):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        rv = {}
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in sorted(iteritems(self.env.domains)):
            for fullname, dispname, type, docname, anchor, prio in \
                    sorted(domain.get_objects()):
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                fullname = htmlescape(fullname)
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             text_type(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                if anchor == fullname:
                    shortanchor = ''
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index):
        rvs = {}, {}
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in iteritems(mapping):
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]
                else:
                    rv[k] = sorted([fn2index[fn] for fn in v if fn in fn2index])
        return rvs

    def freeze(self):
        """Create a usable data structure for serializing."""
        filenames, titles = zip(*sorted(self._titles.items()))
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in iteritems(self._objtypes))
        objnames = self._objnames
        return dict(filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames,
                    titleterms=title_terms, envversion=self.env.version)

    def label(self):
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        for wordnames in itervalues(self._mapping):
            wordnames.intersection_update(filenames)
        for wordnames in itervalues(self._title_mapping):
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word):
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word)
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            word = stem(word)
            if _filter(word):
                self._title_mapping.setdefault(word, set()).add(filename)

        for word in visitor.found_words:
            word = stem(word)
            if word not in self._title_mapping and _filter(word):
                self._mapping.setdefault(word, set()).add(filename)

    def context_for_searchtool(self):
        return dict(
            search_language_stemming_code = self.lang.js_stemmer_code,
            search_language_stop_words = jsdump.dumps(sorted(self.lang.stopwords)),
            search_scorer_tool = self.js_scorer_code,
        )

    def get_js_stemmer_rawcode(self):
        if self.lang.js_stemmer_rawcode:
            return path.join(
                path.dirname(path.abspath(__file__)),
                'non-minified-js',
                self.lang.js_stemmer_rawcode
            )
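
# Minimal sketch of the stem() memoization used in feed() above; str.lower
# stands in for the real language-specific stemmer, and the module-level
# cache is an illustrative simplification of self._stem_cache.
_stem_cache = {}

def stem(word):
    try:
        return _stem_cache[word]
    except KeyError:
        _stem_cache[word] = word.lower()  # real code calls self.lang.stem(word)
        return _stem_cache[word]

assert stem('Builders') == 'builders'
assert 'Builders' in _stem_cache  # the second call is a dict hit, not a re-stem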
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env, lang, options, scoring):
        self.env = env
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # stemmed words in titles -> set(filenames)
        self._title_mapping = {}
        # word -> stemmed word
        self._stem_cache = {}
        # objtype -> index
        self._objtypes = {}
        # objtype index -> (domain, type, objname (localized))
        self._objnames = {}

        # add language-specific SearchLanguage instance
        lang_class = languages.get(lang)
        if lang_class is None:
            self.lang = SearchEnglish(options)
        elif isinstance(lang_class, str):
            module, classname = lang_class.rsplit('.', 1)
            lang_class = getattr(__import__(module, None, None, [classname]),
                                 classname)
            self.lang = lang_class(options)
        else:
            # it's directly a class (e.g. added by app.add_search_language)
            self.lang = lang_class(options)

        if scoring:
            with open(scoring, 'rb') as fp:
                self.js_scorer_code = fp.read().decode('utf-8')
        else:
            self.js_scorer_code = u''

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, string_types):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict) or \
           frozen.get('envversion') != self.env.version:
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))

        def load_terms(mapping):
            rv = {}
            for k, v in iteritems(mapping):
                if isinstance(v, int):
                    rv[k] = set([index2fn[v]])
                else:
                    rv[k] = set(index2fn[i] for i in v)
            return rv

        self._mapping = load_terms(frozen['terms'])
        self._title_mapping = load_terms(frozen['titleterms'])
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, string_types):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        rv = {}
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in iteritems(self.env.domains):
            for fullname, dispname, type, docname, anchor, prio in \
                    domain.get_objects():
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    typeindex = otypes[domainname, type]
                except KeyError:
                    typeindex = len(otypes)
                    otypes[domainname, type] = typeindex
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[typeindex] = (domainname, type,
                                             text_type(domain.get_type_name(otype)))
                    else:
                        onames[typeindex] = (domainname, type, type)
                if anchor == fullname:
                    shortanchor = ''
                elif anchor == type + '-' + fullname:
                    shortanchor = '-'
                else:
                    shortanchor = anchor
                pdict[name] = (fn2index[docname], typeindex, prio, shortanchor)
        return rv

    def get_terms(self, fn2index):
        rvs = {}, {}
        for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
            for k, v in iteritems(mapping):
                if len(v) == 1:
                    fn, = v
                    if fn in fn2index:
                        rv[k] = fn2index[fn]
                else:
                    rv[k] = [fn2index[fn] for fn in v if fn in fn2index]
        return rvs

    def freeze(self):
        """Create a usable data structure for serializing."""
        filenames = list(self._titles.keys())
        titles = list(self._titles.values())
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        terms, title_terms = self.get_terms(fn2index)

        objects = self.get_objects(fn2index)  # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in iteritems(self._objtypes))
        objnames = self._objnames
        return dict(filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames,
                    titleterms=title_terms, envversion=self.env.version)

    def label(self):
        return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        for wordnames in itervalues(self._mapping):
            wordnames.intersection_update(filenames)
        for wordnames in itervalues(self._title_mapping):
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree, self.lang)
        doctree.walk(visitor)

        # memoize self.lang.stem
        def stem(word):
            try:
                return self._stem_cache[word]
            except KeyError:
                self._stem_cache[word] = self.lang.stem(word)
                return self._stem_cache[word]
        _filter = self.lang.word_filter

        for word in visitor.found_title_words:
            word = stem(word)
            if _filter(word):
                self._title_mapping.setdefault(word, set()).add(filename)

        for word in visitor.found_words:
            word = stem(word)
            if word not in self._title_mapping and _filter(word):
                self._mapping.setdefault(word, set()).add(filename)

    def context_for_searchtool(self):
        return dict(
            search_language_stemming_code = self.lang.js_stemmer_code,
            search_language_stop_words = jsdump.dumps(sorted(self.lang.stopwords)),
            search_scorer_tool = self.js_scorer_code,
        )
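
# Illustration of the pruning step in prune() above: the per-word filename
# sets are intersected with the surviving filenames, so removed documents
# drop out of the index. The data below is made up for illustration.
mapping = {'build': {'intro', 'old'}, 'search': {'old'}}
kept = {'intro'}

for wordnames in mapping.values():
    wordnames.intersection_update(kept)

assert mapping == {'build': {'intro'}, 'search': set()}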