Пример #1
0
class TestBuilderBuild:
    """Exercise Builder.build() and the state it leaves on the builder."""

    def setup_method(self, method):
        # One document with a field ("body") deliberately not registered,
        # so we can check that unregistered fields are skipped.
        document = {"id": "id", "title": "test", "body": "missing"}
        self.builder = Builder()
        self.builder.ref("id")
        self.builder.field("title")
        self.builder.add(document)
        self.index = self.builder.build()

    def test_adds_tokens_to_inverted_index(self):
        _assert_deep_keys(self.builder.inverted_index, "test.title.id")

    def test_builds_vector_space_of_the_document_fields(self):
        vectors = self.builder.field_vectors
        assert "title/id" in vectors
        assert isinstance(vectors["title/id"], Vector)

    def test_skips_fields_not_defined_for_indexing(self):
        assert "missing" not in self.builder.inverted_index

    def test_builds_a_token_set_for_the_corpus(self):
        probe = TokenSet.from_string("test")
        overlap = self.builder.token_set.intersect(probe).to_list()
        assert "test" in overlap

    def test_calculates_document_count(self):
        assert self.builder.average_field_length["title"] == 1

    def test_index_is_returned(self):
        assert isinstance(self.index, Index)
Пример #2
0
def lunr(ref, fields, documents, languages=None):
    """A convenience function to configure and construct a lunr.Index.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings defining fields in the documents to
            index. Optionally a list of dictionaries with three keys:
            `field_name` defining the document's field, `boost` an integer
            defining a boost to be applied to the field, and `extractor`
            a callable taking the document as a single argument and returning
            a string located in the document in a particular way.
        documents (list): The list of dictionaries representing the documents
            to index. Optionally a 2-tuple of dicts, the first one being
            the document and the second the associated attributes to it.
        languages (str or list, optional): The languages to use if using
            NLTK language support, ignored if NLTK is not available.

    Returns:
        Index: The populated Index ready to search against.

    Raises:
        RuntimeError: If any requested language is not supported.
    """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        # Accept a single language given as a plain string.
        # Fixed: was `isinstance(languages, basestring)` — a Python 2 name
        # that raises NameError on Python 3 unless aliased elsewhere.
        if isinstance(languages, str):
            languages = [languages]

        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError("The specified languages {} are not supported, "
                               "please choose one of {}".format(
                                   ", ".join(unsupported_languages),
                                   ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                               ))
        builder = lang.get_nltk_builder(languages)
    else:
        builder = Builder()
        # NOTE(review): the stemmer is deliberately left out of both
        # pipelines here (commented out) — confirm this is intentional.
        builder.pipeline.add(trimmer, stop_word_filter)  # , stemmer)
        # builder.search_pipeline.add(stemmer)

    builder.ref(ref)
    for field in fields:
        # Dict fields carry field_name/boost/extractor keyword options.
        if isinstance(field, dict):
            builder.field(**field)
        else:
            builder.field(field)

    for document in documents:
        # A (document, attributes) pair attaches extra attributes to the doc.
        if isinstance(document, (tuple, list)):
            builder.add(document[0], attributes=document[1])
        else:
            builder.add(document)

    return builder.build()
Пример #3
0
    def generate_output(self, writer):
        """Index every article with lunr and write the serialized index,
        plus per-page display metadata, to ``search_index.json``."""
        pages = [self.create_node(article)
                 for article in self.context['articles']]
        output_file = os.path.join(self.output_path, 'search_index.json')

        # Only these three fields are fed to the indexer.
        documents = [
            {'id': page['id'], 'title': page['title'], 'text': page['text']}
            for page in pages
        ]

        # Extra per-page data the client needs to render search results.
        metadata = {
            page['id']: {
                'url': page['url'],
                'title': page['title'],
                'summary': page['summary'],
            }
            for page in pages
        }

        # Registering under a name makes the function serializable
        # with the index pipeline.
        Pipeline.register_function(
            special_chars_remover, 'specialCharsRemover')

        builder = Builder()
        builder.pipeline.add(
            trimmer, stop_word_filter, stemmer, special_chars_remover)
        builder.search_pipeline.add(stemmer)
        builder.ref('id')
        builder.field('title', 10)  # boost title matches
        builder.field('text')

        for document in documents:
            builder.add(document)
        serialized_index = builder.build().serialize()

        with open(output_file, 'w') as out:
            json.dump({
                'index': serialized_index,
                'data': metadata,
            }, out)