Example #1
    def test_str_combines_document_ref_and_field_name(self):
        field_name = "title"
        document_ref = 123
        field_ref = FieldRef(document_ref, field_name)

        assert str(field_ref) == "title/123"
        assert repr(field_ref) == '<FieldRef field="title" ref="123">'
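
For reference, here is a minimal FieldRef sketch that satisfies the assertions above. It is hypothetical; the library's actual implementation may differ in detail:

class FieldRef:
    JOINER = "/"

    def __init__(self, doc_ref, field_name):
        # The ref is stored as a string, which is why repr shows ref="123"
        # even though the test passes the int 123.
        self.doc_ref = str(doc_ref)
        self.field_name = field_name

    def __str__(self):
        # Joins the field name and document ref with the joiner character.
        return self.field_name + self.JOINER + self.doc_ref

    def __repr__(self):
        return '<FieldRef field="{}" ref="{}">'.format(
            self.field_name, self.doc_ref)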
Example #2
    def add(self, doc, attributes=None):
        """Adds a document to the index.

        Before adding documents to the index it should have been fully
        setup, with the document ref and all fields to index already having
        been specified.

        The document must have a field name as specified by the ref (by default
        this is 'id') and it should have all fields defined for indexing,
        though None values will not cause errors.

        Args:
            - doc (dict): The document to be added to the index.
            - attributes (dict, optional): A set of attributes corresponding
            to the document, currently a single `boost` -> int will be
            taken into account.
        """
        doc_ref = str(doc[self._ref])
        self._documents[doc_ref] = attributes or {}
        self.document_count += 1

        for field_name, field in self._fields.items():
            extractor = field.extractor
            field_value = (doc[field_name] if extractor is None
                           else extractor(doc))
            tokens = Tokenizer(field_value)
            terms = self.pipeline.run(tokens)
            field_ref = FieldRef(doc_ref, field_name)
            field_terms = defaultdict(int)

            # TODO: field_refs are cast to strings in JS, should we allow
            # FieldRef as keys?
            self.field_term_frequencies[str(field_ref)] = field_terms
            self.field_lengths[str(field_ref)] = len(terms)

            for term in terms:
                # TODO: term is a Token, should we allow Tokens as keys?
                term_key = str(term)

                field_terms[term_key] += 1
                if term_key not in self.inverted_index:
                    posting = {_field_name: {} for _field_name in self._fields}
                    posting["_index"] = self.term_index
                    self.term_index += 1
                    self.inverted_index[term_key] = posting

                if doc_ref not in self.inverted_index[term_key][field_name]:
                    self.inverted_index[term_key][field_name][
                        doc_ref] = defaultdict(list)

                for metadata_key in self.metadata_whitelist:
                    metadata = term.metadata[metadata_key]
                    self.inverted_index[term_key][field_name][doc_ref][
                        metadata_key].append(metadata)
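
A usage sketch for add (hypothetical: the ref() and field() setup methods are assumed from the lunr.js Builder API and are not shown in this source):

# Assumed Builder surface mirroring lunr.js; names are illustrative.
builder = Builder()
builder.ref("id")                  # the document ref field, 'id' by default
builder.field("title")             # declare every field before adding docs
builder.add(
    {"id": "123", "title": "A tale of two tokenizers"},
    attributes={"boost": 2},       # only `boost` is currently honored
)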
Example #3
    def _calculate_average_field_lengths(self):
        """Calculates the average document length for this index"""
        accumulator = defaultdict(int)
        documents_with_field = defaultdict(int)

        for field_ref, length in self.field_lengths.items():
            _field_ref = FieldRef.from_string(field_ref)
            field = _field_ref.field_name

            documents_with_field[field] += 1
            accumulator[field] += length

        for field_name in self._fields:
            accumulator[field_name] /= documents_with_field[field_name]

        self.average_field_length = accumulator
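
A quick worked example of the averaging above, with hypothetical numbers:

# Suppose two documents produced 'title' fields of 3 and 5 terms:
#   field_lengths == {"title/1": 3, "title/2": 5}
# After the first loop: accumulator["title"] == 8 and
# documents_with_field["title"] == 2, so the second loop yields
#   average_field_length["title"] == 8 / 2 == 4.0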
Example #4
    def _create_field_vectors(self):
        """Builds a vector space model of every document using lunr.Vector."""
        field_vectors = {}
        term_idf_cache = {}

        for field_ref, term_frequencies in self.field_term_frequencies.items():
            _field_ref = FieldRef.from_string(field_ref)
            field_name = _field_ref.field_name
            field_length = self.field_lengths[field_ref]
            field_vector = Vector()
            field_boost = self._fields[field_name].boost
            doc_boost = self._documents[_field_ref.doc_ref].get("boost", 1)

            for term, tf in term_frequencies.items():
                term_index = self.inverted_index[term]["_index"]

                if term not in term_idf_cache:
                    idf = Idf(self.inverted_index[term], self.document_count)
                    term_idf_cache[term] = idf
                else:
                    idf = term_idf_cache[term]

                score = idf * ((self._k1 + 1) * tf) / (
                    self._k1 * (1 - self._b + self._b * (
                        field_length / self.average_field_length[field_name]))
                    + tf)
                score *= field_boost
                score *= doc_boost
                score_with_precision = round(score, 3)

                field_vector.insert(term_index, score_with_precision)

            field_vectors[field_ref] = field_vector

        self.field_vectors = field_vectors
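
The score expression above is the BM25 per-term weight. Written out, with tf the term's frequency in the field, fl the field length, and avgfl the average field length for that field:

    \mathrm{score} = \mathrm{idf} \cdot \frac{(k_1 + 1)\,\mathrm{tf}}{k_1 \left(1 - b + b \cdot \frac{\mathrm{fl}}{\mathrm{avgfl}}\right) + \mathrm{tf}}

The result is then scaled by the field and document boosts and rounded to three decimal places before insertion into the field vector.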
Example #5
    def test_from_string_does_not_contain_join_character(self):
        string = "docRefOnly"

        with pytest.raises(BaseLunrException):
            FieldRef.from_string(string)
Example #6
    def test_from_string_docref_contains_join_character(self):
        field_ref = FieldRef.from_string("title/http://example.com/123")

        assert field_ref.field_name == "title"
        assert field_ref.doc_ref == "http://example.com/123"
Example #7
    def test_from_string_splits_string_into_parts(self):
        field_ref = FieldRef.from_string("title/123")

        assert field_ref.field_name == "title"
        assert field_ref.doc_ref == "123"
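
Taken together, Examples #5 through #7 pin down from_string: split on the first joiner only (so the doc ref may itself contain '/'), and raise when no joiner is present. A minimal sketch consistent with those tests, continuing the hypothetical FieldRef class from Example #1:

    @classmethod
    def from_string(cls, string):
        if cls.JOINER not in string:
            raise BaseLunrException(
                "Malformed field ref string: {}".format(string))
        # Split on the first joiner only; the doc ref may contain '/'.
        field_name, doc_ref = string.split(cls.JOINER, 1)
        return cls(doc_ref, field_name)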
Example #8
    def query(self, query=None, callback=None):
        """Performs a query against the index using the passed lunr.Query
        object.

        If performing programmatic queries against the index, this method is
        preferred over `lunr.Index.search` so as to avoid the additional query
        parsing overhead.

        Args:
            query (lunr.Query): A configured Query to perform the search
                against; use `create_query` to get a preconfigured object,
                or use `callback` for convenience.
            callback (callable): An optional function that is called with
                the Query produced by `create_query` for further
                configuration.
        """
        if query is None:
            query = self.create_query()

        if callback is not None:
            callback(query)

        if len(query.clauses) == 0:
            logger.warning(
                "Attempting a query with no clauses. Please add clauses by "
                "either using the `callback` argument or using `create_query` "
                "to create a preconfigured Query, manually adding clauses and "
                "passing it as the `query` argument.")
            return []

        # for each query clause
        # * process terms
        # * expand terms from token set
        # * find matching documents and metadata
        # * get document vectors
        # * score documents

        matching_fields = {}
        query_vectors = {field: Vector() for field in self.fields}
        term_field_cache = {}
        required_matches = {}
        prohibited_matches = defaultdict(set)

        for clause in query.clauses:
            # Unless the pipeline has been disabled for this term, which is
            # the case for terms with wildcards, we need to pass the clause
            # term through the search pipeline. A pipeline returns a list
            # of processed terms. Pipeline functions may expand the passed
            # term, which means we may end up performing multiple index
            # lookups for a single query term.
            if clause.use_pipeline:
                terms = self.pipeline.run_string(clause.term,
                                                 {"fields": clause.fields})
            else:
                terms = [clause.term]

            clause_matches = set()

            for term in terms:
                # Each term returned from the pipeline needs to use the same
                # query clause object, e.g. the same boost and/or edit
                # distance. The simplest way to do this is to re-use the
                # clause object but mutate its term property.
                clause.term = term

                # From the term in the clause we create a token set which will
                # then be used to intersect the index's token set to get a
                # list of terms to look up in the inverted index.
                term_token_set = TokenSet.from_clause(clause)
                expanded_terms = self.token_set.intersect(
                    term_token_set).to_list()

                # If a term marked as required does not exist in the TokenSet
                # it is impossible for the search to return any matches.
                # We set all the field-scoped required match sets to empty
                # and stop examining further clauses.
                if (len(expanded_terms) == 0
                        and clause.presence == QueryPresence.REQUIRED):
                    for field in clause.fields:
                        required_matches[field] = CompleteSet()
                    break

                for expanded_term in expanded_terms:
                    posting = self.inverted_index[expanded_term]
                    term_index = posting["_index"]

                    for field in clause.fields:
                        # For each field that this query term is scoped by
                        # (by default all fields are in scope) we need to get
                        # all the document refs that have this term in that
                        # field.
                        #
                        # The posting is the entry in the invertedIndex for the
                        # matching term from above.
                        field_posting = posting[field]
                        matching_document_refs = field_posting.keys()
                        term_field = expanded_term + "/" + field
                        matching_documents_set = set(matching_document_refs)

                        # If the presence of this term is required, ensure that
                        # the matching documents are added to the set of
                        # required matches for this clause.
                        if clause.presence == QueryPresence.REQUIRED:
                            clause_matches = clause_matches.union(
                                matching_documents_set)

                            if field not in required_matches:
                                required_matches[field] = CompleteSet()

                        # If the presence of this term is prohibited,
                        # ensure that the matching documents are added to the
                        # set of prohibited matches for this field, creating
                        # that set if it does not exist yet.
                        elif clause.presence == QueryPresence.PROHIBITED:
                            prohibited_matches[field] = prohibited_matches[
                                field].union(matching_documents_set)

                            # prohibited matches should not be part of the
                            # query vector used for similarity scoring and no
                            # metadata should be extracted so we continue
                            # to the next field
                            continue

                        # The query field vector is populated using the
                        # term_index found for the term and a unit value with
                        # the appropriate boost.
                        # Using upsert because there could already be an entry
                        # in the vector for the term we are working with.
                        # In that case we just add the scores together.
                        query_vectors[field].upsert(term_index, clause.boost,
                                                    lambda a, b: a + b)

                        # If we've already seen this term/field combination
                        # then we've already collected the matching documents
                        # and metadata, so there is no need to do it again.
                        if term_field in term_field_cache:
                            continue

                        for matching_document_ref in matching_document_refs:
                            # All metadata for this term/field/document triple
                            # are then extracted and collected into an instance
                            # of lunr.MatchData ready to be returned in the
                            # query results
                            matching_field_ref = FieldRef(
                                matching_document_ref, field)
                            metadata = field_posting[str(
                                matching_document_ref)]

                            if str(matching_field_ref) not in matching_fields:
                                matching_fields[str(
                                    matching_field_ref)] = MatchData(
                                        expanded_term, field, metadata)
                            else:
                                matching_fields[str(matching_field_ref)].add(
                                    expanded_term, field, metadata)

                        term_field_cache[term_field] = True

            # If the presence was required we need to update the required
            # matches field sets. We do this after all fields for the term
            # have collected their matches because the clause term's presence
            # is required in _any_ of the fields, not _all_ of the fields.
            if clause.presence == QueryPresence.REQUIRED:
                for field in clause.fields:
                    required_matches[field] = required_matches[
                        field].intersection(clause_matches)

        # We need to combine the field-scoped required and prohibited
        # matching documents into a global set of required and prohibited
        # matches.
        all_required_matches = CompleteSet()
        all_prohibited_matches = set()
        for field in self.fields:
            if field in required_matches:
                all_required_matches = all_required_matches.intersection(
                    required_matches[field])
            if field in prohibited_matches:
                all_prohibited_matches = all_prohibited_matches.union(
                    prohibited_matches[field])

        matching_field_refs = matching_fields.keys()
        results = []
        matches = {}

        # If the query is negated (only contains prohibited terms) we need
        # to get _all_ field refs currently existing in the index. This is
        # only done when we know the query is entirely prohibited terms, to
        # avoid the cost of fetching all field refs unnecessarily.
        # Additionally, blank match data must be created to correctly
        # populate the results.
        if query.is_negated():
            matching_field_refs = list(self.field_vectors.keys())

            for matching_field_ref in matching_field_refs:
                matching_fields[matching_field_ref] = MatchData()

        for matching_field_ref in matching_field_refs:
            # Currently we have document fields that match the query, but we
            # need to return documents. The matchData and scores are combined
            # from multiple fields belonging to the same document.
            #
            # Scores are calculated by field, using the query vectors created
            # above, and combined into a final document score using addition.
            field_ref = FieldRef.from_string(matching_field_ref)
            doc_ref = field_ref.doc_ref

            if doc_ref not in all_required_matches or doc_ref in all_prohibited_matches:
                continue

            field_vector = self.field_vectors[matching_field_ref]
            score = query_vectors[field_ref.field_name].similarity(
                field_vector)

            try:
                doc_match = matches[doc_ref]
                doc_match["score"] += score
                doc_match["match_data"].combine(
                    matching_fields[matching_field_ref])
            except KeyError:
                match = {
                    "ref": doc_ref,
                    "score": score,
                    "match_data": matching_fields[matching_field_ref],
                }
                matches[doc_ref] = match
                results.append(match)

        return sorted(results, key=lambda a: a["score"], reverse=True)
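
Finally, a usage sketch of the callback form described in the query docstring (hypothetical: the exact Query.term signature is assumed to mirror the lunr.js clause options):

# Assumes an index built with 'title' and 'body' fields.
results = idx.query(
    callback=lambda q: q.term("fish", fields=["title"], boost=10))

for result in results:
    # Each match carries the doc ref, its combined score and match data.
    print(result["ref"], result["score"])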