def test_str_combines_document_ref_and_field_name(self):
    field_name = "title"
    document_ref = 123
    field_ref = FieldRef(document_ref, field_name)

    assert str(field_ref) == "title/123"
    assert repr(field_ref) == '<FieldRef field="title" ref="123">'

def add(self, doc, attributes=None):
    """Adds a document to the index.

    Before adding documents to the index it should have been fully set up,
    with the document ref and all fields to index already having been
    specified.

    The document must have a field name as specified by the ref (by default
    this is 'id') and it should have all fields defined for indexing,
    though None values will not cause errors.

    Args:
        - doc (dict): The document to be added to the index.
        - attributes (dict, optional): A set of attributes corresponding to
          the document; currently a single `boost` -> int will be taken
          into account.
    """
    doc_ref = str(doc[self._ref])
    self._documents[doc_ref] = attributes or {}
    self.document_count += 1

    for field_name, field in self._fields.items():
        extractor = field.extractor
        field_value = doc[field_name] if extractor is None else extractor(doc)
        tokens = Tokenizer(field_value)
        terms = self.pipeline.run(tokens)
        field_ref = FieldRef(doc_ref, field_name)
        field_terms = defaultdict(int)

        # TODO: field_refs are casted to strings in JS, should we allow
        # FieldRef as keys?
        self.field_term_frequencies[str(field_ref)] = field_terms
        self.field_lengths[str(field_ref)] = len(terms)

        for term in terms:
            # TODO: term is a Token, should we allow Tokens as keys?
            term_key = str(term)

            field_terms[term_key] += 1
            if term_key not in self.inverted_index:
                posting = {_field_name: {} for _field_name in self._fields}
                posting["_index"] = self.term_index
                self.term_index += 1
                self.inverted_index[term_key] = posting

            if doc_ref not in self.inverted_index[term_key][field_name]:
                self.inverted_index[term_key][field_name][doc_ref] = defaultdict(list)

            for metadata_key in self.metadata_whitelist:
                metadata = term.metadata[metadata_key]
                self.inverted_index[term_key][field_name][doc_ref][metadata_key].append(metadata)

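
# A hedged usage sketch for `add` (not from the source above): it assumes the
# Builder's companion methods `ref()`, `field()` and `build()` from lunr.py,
# and the document contents and field names are invented for illustration.
def builder_add_usage_sketch():
    from lunr.builder import Builder  # assumed import path

    builder = Builder()
    builder.ref("id")                 # documents are keyed on their "id" field
    builder.field("title", boost=10)  # field-level boost, applied when scoring
    builder.field("body")
    builder.add(
        {"id": "1", "title": "Moby Dick", "body": "Call me Ishmael."},
        attributes={"boost": 2},      # document-level boost stored against the ref
    )
    return builder.build()
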
def _calculate_average_field_lengths(self):
    """Calculates the average document length for this index."""
    accumulator = defaultdict(int)
    documents_with_field = defaultdict(int)

    for field_ref, length in self.field_lengths.items():
        _field_ref = FieldRef.from_string(field_ref)
        field = _field_ref.field_name

        documents_with_field[field] += 1
        accumulator[field] += length

    for field_name in self._fields:
        accumulator[field_name] /= documents_with_field[field_name]

    self.average_field_length = accumulator

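
# A small hedged illustration of the averaging above, using hypothetical
# field_lengths entries keyed by "field_name/doc_ref" strings as produced by
# `add`: two documents contribute "title" lengths 3 and 5, giving average 4.0.
from collections import defaultdict


def average_field_lengths_sketch(field_lengths):
    totals, counts = defaultdict(int), defaultdict(int)
    for field_ref, length in field_lengths.items():
        field_name = field_ref.split("/", 1)[0]
        totals[field_name] += length
        counts[field_name] += 1
    return {field: totals[field] / counts[field] for field in totals}


# average_field_lengths_sketch({"title/1": 3, "title/2": 5}) == {"title": 4.0}
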
def _create_field_vectors(self):
    """Builds a vector space model of every document using lunr.Vector."""
    field_vectors = {}
    term_idf_cache = {}

    for field_ref, term_frequencies in self.field_term_frequencies.items():
        _field_ref = FieldRef.from_string(field_ref)
        field_name = _field_ref.field_name
        field_length = self.field_lengths[field_ref]
        field_vector = Vector()
        field_boost = self._fields[field_name].boost
        doc_boost = self._documents[_field_ref.doc_ref].get("boost", 1)

        for term, tf in term_frequencies.items():
            term_index = self.inverted_index[term]["_index"]

            if term not in term_idf_cache:
                idf = Idf(self.inverted_index[term], self.document_count)
                term_idf_cache[term] = idf
            else:
                idf = term_idf_cache[term]

            score = (
                idf
                * ((self._k1 + 1) * tf)
                / (
                    self._k1
                    * (
                        1
                        - self._b
                        + self._b
                        * (field_length / self.average_field_length[field_name])
                    )
                    + tf
                )
            )
            score *= field_boost
            score *= doc_boost
            score_with_precision = round(score, 3)

            field_vector.insert(term_index, score_with_precision)

        field_vectors[field_ref] = field_vector

    self.field_vectors = field_vectors

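
# A hedged worked example of the scoring expression above, computed with plain
# floats instead of the index internals. The inputs (idf, tf, field length,
# boosts) are invented purely to show how k1 and b shape the per-term score.
def bm25_term_score_sketch(idf, tf, field_length, average_field_length,
                           field_boost=1, doc_boost=1, k1=1.2, b=0.75):
    score = idf * ((k1 + 1) * tf) / (
        k1 * (1 - b + b * (field_length / average_field_length)) + tf
    )
    return round(score * field_boost * doc_boost, 3)


# e.g. a term with idf=1.5 appearing twice in a 10-token field whose average
# length is 8: bm25_term_score_sketch(1.5, 2, 10, 8) == 1.927
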
def test_from_string_does_not_contain_join_character(self):
    string = "docRefOnly"

    with pytest.raises(BaseLunrException):
        FieldRef.from_string(string)

def test_from_string_docref_contains_join_character(self):
    field_ref = FieldRef.from_string("title/http://example.com/123")

    assert field_ref.field_name == "title"
    assert field_ref.doc_ref == "http://example.com/123"

def test_from_string_splits_string_into_parts(self):
    field_ref = FieldRef.from_string("title/123")

    assert field_ref.field_name == "title"
    assert field_ref.doc_ref == "123"

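
# A minimal sketch (not the library's actual implementation) of the behaviour
# the three tests above pin down: splitting happens only on the *first* joiner
# character, so doc refs that themselves contain "/" (e.g. URLs) survive
# intact, and a string with no joiner at all is rejected. The import paths are
# assumed from lunr.py's module layout.
def field_ref_from_string_sketch(string, joiner="/"):
    from lunr.exceptions import BaseLunrException
    from lunr.field_ref import FieldRef

    index = string.find(joiner)
    if index == -1:
        # mirrors the BaseLunrException expected by the test above
        raise BaseLunrException("Malformed field ref string")

    field_name, doc_ref = string[:index], string[index + 1:]
    return FieldRef(doc_ref, field_name)
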
def query(self, query=None, callback=None):
    """Performs a query against the index using the passed lunr.Query object.

    If performing programmatic queries against the index, this method is
    preferred over `lunr.Index.search` so as to avoid the additional query
    parsing overhead.

    Args:
        query (lunr.Query): A configured Query to perform the search
            against; use `create_query` to get a preconfigured object
            or use `callback` for convenience.
        callback (callable): An optional function taking a single Query
            object, the result of `create_query`, for further configuration.
    """
    if query is None:
        query = self.create_query()

    if callback is not None:
        callback(query)

    if len(query.clauses) == 0:
        logger.warning(
            "Attempting a query with no clauses. Please add clauses by "
            "either using the `callback` argument or using `create_query` "
            "to create a preconfigured Query, manually adding clauses and "
            "passing it as the `query` argument."
        )
        return []

    # for each query clause
    # * process terms
    # * expand terms from token set
    # * find matching documents and metadata
    # * get document vectors
    # * score documents
    matching_fields = {}
    query_vectors = {field: Vector() for field in self.fields}
    term_field_cache = {}
    required_matches = {}
    prohibited_matches = defaultdict(set)

    for clause in query.clauses:
        # Unless the pipeline has been disabled for this term, which is
        # the case for terms with wildcards, we need to pass the clause
        # term through the search pipeline. A pipeline returns an array
        # of processed terms. Pipeline functions may expand the passed
        # term, which means we may end up performing multiple index
        # lookups for a single query term.
        if clause.use_pipeline:
            terms = self.pipeline.run_string(clause.term, {"fields": clause.fields})
        else:
            terms = [clause.term]

        clause_matches = set()

        for term in terms:
            # Each term returned from the pipeline needs to use the same
            # query clause object, e.g. the same boost and/or edit distance.
            # The simplest way to do this is to re-use the clause object
            # but mutate its term property.
            clause.term = term

            # From the term in the clause we create a token set which will
            # then be used to intersect the index's token set to get a list
            # of terms to look up in the inverted index.
            term_token_set = TokenSet.from_clause(clause)
            expanded_terms = self.token_set.intersect(term_token_set).to_list()

            # If a term marked as required does not exist in the TokenSet
            # it is impossible for the search to return any matches.
            # We set all of the field-scoped required match sets to empty
            # and stop examining further clauses.
            if len(expanded_terms) == 0 and clause.presence == QueryPresence.REQUIRED:
                for field in clause.fields:
                    required_matches[field] = CompleteSet()
                break

            for expanded_term in expanded_terms:
                posting = self.inverted_index[expanded_term]
                term_index = posting["_index"]

                for field in clause.fields:
                    # For each field that this query term is scoped by
                    # (by default all fields are in scope) we need to get
                    # all the document refs that have this term in that
                    # field.
                    #
                    # The posting is the entry in the invertedIndex for the
                    # matching term from above.
                    field_posting = posting[field]
                    matching_document_refs = field_posting.keys()
                    term_field = expanded_term + "/" + field
                    matching_documents_set = set(matching_document_refs)

                    # If the presence of this term is required, ensure that
                    # the matching documents are added to the set of
                    # required matches for this clause.
                    if clause.presence == QueryPresence.REQUIRED:
                        clause_matches = clause_matches.union(matching_documents_set)

                        if field not in required_matches:
                            required_matches[field] = CompleteSet()

                    # If the presence of this term is prohibited, ensure
                    # that the matching documents are added to the set of
                    # prohibited matches for this field, creating that set
                    # if it does not exist yet.
                    elif clause.presence == QueryPresence.PROHIBITED:
                        prohibited_matches[field] = prohibited_matches[field].union(
                            matching_documents_set
                        )

                        # Prohibited matches should not be part of the
                        # query vector used for similarity scoring and no
                        # metadata should be extracted, so we continue
                        # to the next field.
                        continue

                    # The query field vector is populated using the
                    # term_index found for the term and a unit value with
                    # the appropriate boost.
                    # Using upsert because there could already be an entry
                    # in the vector for the term we are working with.
                    # In that case we just add the scores together.
                    query_vectors[field].upsert(
                        term_index, clause.boost, lambda a, b: a + b
                    )

                    # If we've already seen this term/field combo then
                    # we've already collected the matching documents and
                    # metadata, no need to go through all that again.
                    if term_field in term_field_cache:
                        continue

                    for matching_document_ref in matching_document_refs:
                        # All metadata for this term/field/document triple
                        # are then extracted and collected into an instance
                        # of lunr.MatchData ready to be returned in the
                        # query results.
                        matching_field_ref = FieldRef(matching_document_ref, field)
                        metadata = field_posting[str(matching_document_ref)]

                        if str(matching_field_ref) not in matching_fields:
                            matching_fields[str(matching_field_ref)] = MatchData(
                                expanded_term, field, metadata
                            )
                        else:
                            matching_fields[str(matching_field_ref)].add(
                                expanded_term, field, metadata
                            )

                    term_field_cache[term_field] = True

        # If the presence was required we need to update the required
        # matches field sets. We do this after all fields for the term
        # have collected their matches because the clause term's presence
        # is required in _any_ of the fields, not _all_ of the fields.
        if clause.presence == QueryPresence.REQUIRED:
            for field in clause.fields:
                required_matches[field] = required_matches[field].intersection(
                    clause_matches
                )

    # We need to combine the field-scoped required and prohibited
    # matching documents into a global set of required and prohibited
    # matches.
    all_required_matches = CompleteSet()
    all_prohibited_matches = set()
    for field in self.fields:
        if field in required_matches:
            all_required_matches = all_required_matches.intersection(
                required_matches[field]
            )
        if field in prohibited_matches:
            all_prohibited_matches = all_prohibited_matches.union(
                prohibited_matches[field]
            )

    matching_field_refs = matching_fields.keys()
    results = []
    matches = {}

    # If the query is negated (only contains prohibited terms) we need to
    # get _all_ field_refs currently existing in the index. This is done
    # only here to avoid the cost of collecting all field refs
    # unnecessarily. Additionally, blank match data must be created to
    # correctly populate the results.
    if query.is_negated():
        matching_field_refs = list(self.field_vectors.keys())

        for matching_field_ref in matching_field_refs:
            field_ref = FieldRef.from_string(matching_field_ref)
            matching_fields[matching_field_ref] = MatchData()

    for matching_field_ref in matching_field_refs:
        # Currently we have document fields that match the query, but we
        # need to return documents. The matchData and scores are combined
        # from multiple fields belonging to the same document.
        #
        # Scores are calculated by field, using the query vectors created
        # above, and combined into a final document score using addition.
        field_ref = FieldRef.from_string(matching_field_ref)
        doc_ref = field_ref.doc_ref

        if doc_ref not in all_required_matches or doc_ref in all_prohibited_matches:
            continue

        field_vector = self.field_vectors[matching_field_ref]
        score = query_vectors[field_ref.field_name].similarity(field_vector)

        try:
            doc_match = matches[doc_ref]
            doc_match["score"] += score
            doc_match["match_data"].combine(matching_fields[matching_field_ref])
        except KeyError:
            match = {
                "ref": doc_ref,
                "score": score,
                "match_data": matching_fields[matching_field_ref],
            }
            matches[doc_ref] = match
            results.append(match)

    return sorted(results, key=lambda a: a["score"], reverse=True)
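
# A hedged usage sketch of the two ways to drive `query` described in its
# docstring. `idx` is assumed to be a built lunr Index; the `create_query`
# fields argument, the Query.term method and its boost keyword follow the
# lunr.py Query API as I understand it, and the field/term names are
# illustrative only.
def query_usage_sketch(idx):
    # Convenience path: let `query` create the Query and configure it
    # through the `callback` argument.
    results = idx.query(callback=lambda q: q.term("ishmael", boost=10))

    # Programmatic path: preconfigure a Query via `create_query`, add
    # clauses, then pass it as the `query` argument.
    q = idx.create_query(fields=["title"])
    q.term("moby")
    results = idx.query(q)

    # Each result is a dict with "ref", "score" and "match_data" keys,
    # as built in the method above.
    for match in results:
        print(match["ref"], match["score"])
    return results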