# Imports assume lunr.py's module layout.
from lunr.builder import Builder
from lunr.index import Index
from lunr.token_set import TokenSet
from lunr.vector import Vector


class TestBuilderBuild:
    def setup_method(self, method):
        self.builder = Builder()
        doc = {"id": "id", "title": "test", "body": "missing"}

        self.builder.ref("id")
        self.builder.field("title")
        self.builder.add(doc)
        self.index = self.builder.build()

    def test_adds_tokens_to_inverted_index(self):
        _assert_deep_keys(self.builder.inverted_index, "test.title.id")

    def test_builds_vector_space_of_the_document_fields(self):
        assert "title/id" in self.builder.field_vectors
        assert isinstance(self.builder.field_vectors["title/id"], Vector)

    def test_skips_fields_not_defined_for_indexing(self):
        assert "missing" not in self.builder.inverted_index

    def test_builds_a_token_set_for_the_corpus(self):
        needle = TokenSet.from_string("test")
        assert "test" in self.builder.token_set.intersect(needle).to_list()

    def test_calculates_average_field_length(self):
        assert self.builder.average_field_length["title"] == 1

    def test_index_is_returned(self):
        assert isinstance(self.index, Index)
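# The tests above call a `_assert_deep_keys` helper defined elsewhere in the
# test module. A minimal sketch, assuming all it does is walk a dot-separated
# key path through nested dicts (the real helper's behaviour may differ):
def _assert_deep_keys(mapping, key_path):
    """Assert that every key along a dotted path exists in a nested mapping."""
    node = mapping
    for key in key_path.split("."):
        assert key in node, "{!r} not found in {!r}".format(key, node)
        node = node[key]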
# Imports assume lunr.py's module layout.
from lunr import languages as lang
from lunr.builder import Builder
from lunr.stemmer import stemmer
from lunr.stop_word_filter import stop_word_filter
from lunr.trimmer import trimmer


def lunr(ref, fields, documents, languages=None):
    """A convenience function to configure and construct a lunr.Index.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings defining fields in the documents to
            index. Optionally, a list of dictionaries with three keys:
            `field_name` defining the document's field, `boost` an integer
            defining a boost to be applied to the field, and `extractor`
            a callable taking the document as a single argument and returning
            the field's value, however it is located in the document.
        documents (list): The list of dictionaries representing the documents
            to index. Optionally, a list of 2-tuples of dicts, the first one
            being the document and the second its associated attributes.
        languages (str or list, optional): The languages to use if using
            NLTK language support, ignored if NLTK is not available.

    Returns:
        Index: The populated Index, ready to search against.
    """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        if isinstance(languages, str):
            languages = [languages]

        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError(
                "The specified languages {} are not supported, "
                "please choose one of {}".format(
                    ", ".join(unsupported_languages),
                    ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                )
            )
        builder = lang.get_nltk_builder(languages)
    else:
        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter, stemmer)
        builder.search_pipeline.add(stemmer)

    builder.ref(ref)
    for field in fields:
        if isinstance(field, dict):
            builder.field(**field)
        else:
            builder.field(field)

    for document in documents:
        if isinstance(document, (tuple, list)):
            builder.add(document[0], attributes=document[1])
        else:
            builder.add(document)

    return builder.build()
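# A quick usage sketch for the convenience function above; the documents and
# the query are made up for illustration:
documents = [
    {"id": "a", "title": "Mr. Green kills Colonel Mustard", "body": "..."},
    {"id": "b", "title": "Professor Plumb waters the plant", "body": "..."},
]
# Fields may be plain names or dicts with `field_name`/`boost`/`extractor`:
idx = lunr(
    ref="id",
    fields=[{"field_name": "title", "boost": 10}, "body"],
    documents=documents,
)
results = idx.search("plant")  # [{"ref": "b", "score": ..., "match_data": ...}]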
# Requires module-level imports of os and json, plus Builder, Pipeline,
# trimmer, stop_word_filter, and stemmer from lunr, and the plugin's own
# special_chars_remover pipeline function.
def generate_output(self, writer):
    # Build one node per article and assemble the documents to index.
    pages = [self.create_node(x) for x in self.context['articles']]
    path = os.path.join(self.output_path, 'search_index.json')

    pages_to_index = [{
        'id': x['id'],
        'title': x['title'],
        'text': x['text'],
    } for x in pages]

    # Extra per-document data, keyed by id and stored alongside the index,
    # so search results can be rendered without re-reading the articles.
    additional_data = {
        x['id']: {
            'url': x['url'],
            'title': x['title'],
            'summary': x['summary'],
        }
        for x in pages
    }

    # Register the custom pipeline function under a stable label so the
    # serialized index can be deserialized later.
    Pipeline.register_function(special_chars_remover, 'specialCharsRemover')

    bldr = Builder()
    bldr.pipeline.add(trimmer, stop_word_filter, stemmer, special_chars_remover)
    bldr.search_pipeline.add(stemmer)

    bldr.ref('id')
    bldr.field('title', boost=10)  # matches in titles rank higher
    bldr.field('text')

    for page in pages_to_index:
        bldr.add(page)

    idx = bldr.build().serialize()

    with open(path, 'w') as idxfile:
        json.dump({
            'index': idx,
            'data': additional_data,
        }, idxfile)
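# The serialized index written above can be loaded back with lunr.py (or by
# lunr.js in the browser). A sketch, assuming `special_chars_remover` is
# importable at load time and that the file path matches the plugin's output
# location; the function must be re-registered under the same label before
# deserializing, because the serialized pipeline references functions by label:
import json

from lunr.index import Index
from lunr.pipeline import Pipeline

Pipeline.register_function(special_chars_remover, 'specialCharsRemover')
with open('search_index.json') as fp:
    payload = json.load(fp)
idx = Index.load(payload['index'])
for result in idx.search('plant'):
    print(payload['data'][result['ref']]['url'])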