def test_register_function_warns_when_adding_function_with_same_label(self):
    Pipeline.register_function(self.fn, "fn")
    with patch("lunr.pipeline.log") as mock_log:
        Pipeline.register_function(self.fn, "fn")
        mock_log.warning.assert_called_once()

def register_languages():
    """Register all supported languages to ensure compatibility."""
    for language in set(SUPPORTED_LANGUAGES) - {"en"}:
        language_stemmer = partial(nltk_stemmer, get_language_stemmer(language))
        Pipeline.register_function(language_stemmer, "stemmer-{}".format(language))

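# Why the per-language labels matter (a minimal sketch, not part of lunr.py's test
# suite): Pipeline.load resolves a serialized pipeline by label, so an index saved
# with a "stemmer-fr" stage can only be rebuilt after register_languages() -- or an
# equivalent registration -- has run. The no-op function below is hypothetical and
# stands in for a real NLTK-backed stemmer.
from lunr.pipeline import Pipeline


def fake_french_stemmer(token, i=None, tokens=None):
    # Stand-in that returns the token unchanged.
    return token


Pipeline.register_function(fake_french_stemmer, "stemmer-fr")
pipeline = Pipeline.load(["stemmer-fr"])
assert pipeline.serialize() == ["stemmer-fr"]
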
def test_load_with_registered_functions(self):
    serialized_pipeline = ["fn"]
    Pipeline.register_function(fn, "fn")
    pipeline = Pipeline.load(serialized_pipeline)
    assert len(pipeline) == 1
    assert pipeline._stack[0] == fn

def generate_stop_word_filter(stop_words, language=None):
    """Builds a `stop_word_filter` function from the provided list of stop words.

    The built-in `stop_word_filter` is built using this factory; the factory can
    also be used to generate custom `stop_word_filter`s for applications or
    non-English languages.
    """

    def stop_word_filter(token, i=None, tokens=None):
        if token and str(token) not in stop_words:
            return token

    # camelCased for compatibility with lunr.js
    label = (
        "stopWordFilter-{}".format(language)
        if language is not None
        else "stopWordFilter"
    )
    Pipeline.register_function(stop_word_filter, label)
    return stop_word_filter

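# A minimal usage sketch (assuming the factory above is exposed as
# lunr.stop_word_filter.generate_stop_word_filter): the generated filter drops
# stop words by returning None and passes every other token through unchanged,
# which is how pipeline stages discard tokens.
from lunr.stop_word_filter import generate_stop_word_filter

es_stop_word_filter = generate_stop_word_filter(["de", "la", "que"], language="es")
assert es_stop_word_filter("casa") == "casa"  # kept
assert es_stop_word_filter("de") is None      # filtered out
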
def test_add_token_metadata():
    builder = get_default_builder()

    def token_length(token, i, tokens):
        token.metadata["token_length"] = len(str(token))
        return token

    Pipeline.register_function(token_length)
    builder.pipeline.add(token_length)
    builder.metadata_whitelist.append("token_length")

    idx = lunr("id", ("title", "body"), documents, builder=builder)

    [result, _, _] = idx.search("green")
    assert result["match_data"].metadata["green"]["title"]["token_length"] == [5]
    assert result["match_data"].metadata["green"]["body"]["token_length"] == [5, 5]

def get_nltk_builder(languages):
    """Returns a builder with stemmers for all languages added to it.

    Args:
        languages (list): A list of supported languages.
    """
    # all_stemmers = []
    all_stopwords_filters = []
    all_word_characters = set()

    for language in languages:
        if language == "en":
            # use Lunr's defaults
            # all_stemmers.append(lunr.stemmer.stemmer)
            all_stopwords_filters.append(stop_word_filter)
            all_word_characters.update({r"\w"})
        else:
            stopwords, word_characters = _get_stopwords_and_word_characters(language)
            # all_stemmers.append(
            #     Pipeline.registered_functions["stemmer-{}".format(language)]
            # )
            all_stopwords_filters.append(
                generate_stop_word_filter(stopwords, language=language)
            )
            all_word_characters.update(word_characters)

    builder = Builder()
    multi_trimmer = generate_trimmer("".join(sorted(all_word_characters)))
    Pipeline.register_function(
        multi_trimmer, "lunr-multi-trimmer-{}".format("-".join(languages))
    )
    builder.pipeline.reset()

    for fn in chain([multi_trimmer], all_stopwords_filters):  # , all_stemmers):
        builder.pipeline.add(fn)

    # for fn in all_stemmers:
    #     builder.search_pipeline.add(fn)

    return builder

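# Usage sketch (assumes lunr.py is installed with its NLTK extra and the relevant
# stopword corpora are available; the documents below are illustrative): pass the
# returned builder to lunr() so English and French share the combined trimmer and
# both stop word filters built above.
from lunr import lunr

documents = [
    {"id": "a", "title": "Ground control", "text": "ground control to major tom"},
    {"id": "b", "title": "Tour de contrôle", "text": "ici le sol appelle major tom"},
]
builder = get_nltk_builder(["en", "fr"])
idx = lunr(ref="id", fields=("title", "text"), documents=documents, builder=builder)
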
def generate_output(self, writer):
    pages = [self.create_node(x) for x in self.context['articles']]
    path = os.path.join(self.output_path, 'search_index.json')

    pages_to_index = [{
        'id': x['id'],
        'title': x['title'],
        'text': x['text'],
    } for x in pages]

    additional_data = {
        x['id']: {
            'url': x['url'],
            'title': x['title'],
            'summary': x['summary'],
        }
        for x in pages
    }

    Pipeline.register_function(special_chars_remover, 'specialCharsRemover')

    bldr = Builder()
    bldr.pipeline.add(trimmer, stop_word_filter, stemmer, special_chars_remover)
    bldr.search_pipeline.add(stemmer)
    bldr.ref('id')
    bldr.field('title', 10)
    bldr.field('text')

    for page in pages_to_index:
        bldr.add(page)

    idx = bldr.build().serialize()

    with open(path, 'w') as idxfile:
        json.dump({
            'index': idx,
            'data': additional_data,
        }, idxfile)

        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k + 1]


porter_stemmer = PorterStemmer()


def stemmer(token, i=None, tokens=None):
    """Wrapper around the PorterStemmer for inclusion in pipeline.

    Args:
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set.
        tokens (list): A list of tokens representing the set.
    """
    return token.update(porter_stemmer.stem)


Pipeline.register_function(stemmer, "stemmer")

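# Quick sanity check of the wrapper (a sketch, assuming lunr exposes its token
# class as lunr.token.Token and that Token.update applies the callable to the
# underlying string):
from lunr.token import Token

assert str(stemmer(Token("running"))) == "run"
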
import re

from lunr.pipeline import Pipeline

full_re = re.compile(r"^\W*?([^\W]+)\W*?$")


def trimmer(token, i=None, tokens=None):
    def trim(s, metadata=None):
        match = full_re.match(s)
        if match is None:
            return s
        return match.group(1)

    return token.update(trim)


Pipeline.register_function(trimmer, "trimmer")

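# What the trimmer does to a single token (same lunr.token.Token assumption as
# above): leading and trailing non-word characters are stripped; tokens whose
# interior contains non-word characters are returned unchanged by this regex.
from lunr.token import Token

assert str(trimmer(Token("hello!"))) == "hello"
assert str(trimmer(Token("'hello'"))) == "hello"
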
def test_register_function_adds_function_to_list_of_registered_functions(self):
    Pipeline.register_function(self.fn, "fn")
    assert Pipeline.registered_functions["fn"] == self.fn

def test_register_function_adds_a_label_property_to_the_function(self):
    Pipeline.register_function(self.fn, "fn")
    assert self.fn.label == "fn"

def test_serialize_returns_array_of_registered_function_labels(self):
    Pipeline.register_function(fn, "fn")
    self.pipeline.add(fn)
    assert self.pipeline.serialize() == ["fn"]
    assert repr(self.pipeline) == '<Pipeline stack="fn">'

def test_register_function_label_defaults_to_name_of_the_function(self):
    Pipeline.register_function(self.fn)
    assert self.fn.label == self.fn.__name__