def test_span_characters():
    ix = get_index()
    with ix.searcher() as s:
        pq = Phrase("text", ["bravo", "echo"])
        m = pq.matcher(s)
        while m.is_active():
            orig = " ".join(s.stored_fields(m.id())["text"])
            for span in m.spans():
                startchar, endchar = span.startchar, span.endchar
                assert orig[startchar:endchar] == "bravo echo"
            m.next()
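# The get_index() helper used above is not shown in this listing. Below is a
# minimal sketch of what the test assumes (an assumption, not the original
# helper): the field must record character offsets so the matcher's spans
# carry startchar/endchar, and the stored value is the word tuple itself,
# which is why the test rejoins it with spaces.
from itertools import permutations

from whoosh import fields
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage

def get_index():
    # chars=True records character offsets for each posting, which is what
    # populates span.startchar and span.endchar.
    schema = fields.Schema(text=fields.TEXT(stored=True, chars=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for ls in permutations(u("alfa bravo echo charlie").split(), 4):
        # _stored_text overrides the stored value with the raw word tuple.
        w.add_document(text=u(" ").join(ls), _stored_text=ls)
    w.commit()
    return ix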
def test_boost_phrase():
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True),
                           text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        t = u(" ").join(ls)
        w.add_document(title=t, text=t)
    w.commit()

    q = Or([Term("title", u("alfa")),
            Term("title", u("bravo")),
            Phrase("text", [u("bravo"), u("charlie"), u("delta")])])

    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 1000.0
            return q
        else:
            return q.apply(boost_phrases)
    q = boost_phrases(q)

    with ix.searcher() as s:
        r = s.search(q, limit=None)
        for hit in r:
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
def search(self, q: str):
    res = None
    with self.ix.searcher() as searcher:
        query = Phrase(self.search_index, self.process_query(q))
        # query = QueryParser(self.search_index, self.ix.schema).parse(self.process_query(q))
        results = searcher.search(query)
        # Materialize the hit fields before the searcher closes.
        res = [item.fields() for item in results]
    return res
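# Phrase expects a list of analyzed words, so process_query presumably turns
# the raw query string into tokens. A hypothetical version (the method is not
# shown in the source) that reuses the field's own analyzer:
def process_query(self, q: str):
    analyzer = self.ix.schema[self.search_index].analyzer
    return [token.text for token in analyzer(q)]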
def test_phrase_score():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.add_document(name=u("F"),
                        value=u("Little miss muffet little miss muffet"))
    writer.commit()

    with ix.searcher() as s:
        q = Phrase("value", [u("little"), u("miss"), u("muffet")])
        m = q.matcher(s)
        assert_equal(m.id(), 0)
        score1 = m.weight()
        assert score1 > 0
        m.next()
        assert_equal(m.id(), 3)
        assert m.weight() > score1
def re_weight_query(self, query, terms):
    # Weights decay linearly with the term's rank: the first expansion term
    # gets the highest weight, the last gets 0.1.
    weights = [1 - (0.9 * ((i + 1) / len(terms)))
               for i, t in enumerate(terms)]
    expanded_query = query.with_boost(1)
    for i, w in enumerate(weights):
        tokens = [token.text for token in self.query_analyzer(terms[i])]
        q = Phrase('text', tokens, slop=3).with_boost(w)
        expanded_query = expanded_query | q
    return expanded_query
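# For context (values computed here, not taken from the source): the weight
# formula above runs from just under 1.0 for the first expansion term down to
# 0.1 for the last, so weaker expansion terms contribute proportionally less
# to the final query.
terms = ["alpha", "beta", "gamma", "delta"]
weights = [1 - (0.9 * ((i + 1) / len(terms))) for i in range(len(terms))]
print(weights)  # approximately [0.775, 0.55, 0.325, 0.1]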
def test_accept():
    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 2.0
        return q

    before = And([Term("a", u("b")),
                  Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")])]),
                  Phrase("a", [u("g"), u("h")], boost=0.25)])
    after = before.accept(boost_phrases)
    assert_equal(after,
                 And([Term("a", u("b")),
                      Or([Term("c", u("d")),
                          Phrase("a", [u("e"), u("f")], boost=2.0)]),
                      Phrase("a", [u("g"), u("h")], boost=0.5)]))

    before = Phrase("a", [u("b"), u("c")], boost=2.5)
    after = before.accept(boost_phrases)
    assert_equal(after, Phrase("a", [u("b"), u("c")], boost=5.0))
def test_phrase_sameword():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(id=1, text=u("The film Linda Linda Linda is good"))
    writer.add_document(id=2, text=u("The model Linda Evangelista is pretty"))
    writer.commit()

    with ix.searcher() as s:
        r = s.search(Phrase("text", ["linda", "linda", "linda"]), limit=None)
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], 1)
def test_phrase_order():
    tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer())
    schema = fields.Schema(text=tfield)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    for ls in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(ls))
    writer.commit()

    with ix.searcher() as s:
        def result(q):
            r = s.search(q, limit=None, sortedby=None)
            return sorted([d['text'] for d in r])

        q = Phrase("text", ["bay", "can", "day"])
        assert_equal(result(q), [u('ape bay can day'), u('bay can day ape')])
def test_phrase_multi():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta echo").split()
    w = None
    for i, ls in enumerate(permutations(domain)):
        if w is None:
            w = ix.writer()
        w.add_document(id=i, text=u(" ").join(ls))
        # Commit in batches of 30 so the index ends up with multiple segments.
        if not i % 30:
            w.commit()
            w = None
    if w is not None:
        w.commit()

    with ix.searcher() as s:
        q = Phrase("text", ["alfa", "bravo"])
        _ = s.search(q)
def __build_search_query(self, keywords):
    keyword_list_no_stopw = kwlist_no_stopwords(keywords)
    keyword_list_stemmed = kwlist_stemmed(keywords)

    # Build the terms to search.
    fields_to_search = ['fieldname', 'fieldname_processed_parents',
                        'fieldname_processed_current', 'fieldname_current',
                        'title']
    all_fields_and_terms = [Term(f, kw)
                            for kw in keyword_list_no_stopw
                            for f in fields_to_search]
    all_fields_and_terms.extend([Term('title_stemmed', kw)
                                 for kw in keyword_list_stemmed])
    if keyword_list_no_stopw and len(keyword_list_no_stopw) > 1:
        # noinspection PyTypeChecker
        all_fields_and_terms.append(
            Phrase('title', keyword_list_no_stopw, boost=2, slop=1))
    return all_fields_and_terms
def __build_search_query(cls, keywords):
    """Prepare the search query for the IR engine."""
    keyword_list_no_stopw = kwlist_no_stopwords(keywords)
    keyword_list_stemmed = kwlist_stemmed(keywords)
    fields_to_search = ['fieldname', 'fieldname_processed_parents',
                        'fieldname_processed_current']
    all_fields_and_terms = [Term(f, kw)
                            for kw in keyword_list_stemmed
                            for f in fields_to_search]
    all_fields_and_terms.extend([Term('title_stemmed', kw)
                                 for kw in keyword_list_stemmed])
    # Note: we do not search by stopwords.
    # The phrase also uses title_stemmed, so it will match the stemmed title;
    # the score boost therefore shouldn't be too high.
    if keyword_list_no_stopw and len(keyword_list_no_stopw) > 1:
        # noinspection PyTypeChecker
        all_fields_and_terms.append(
            Phrase('title_stemmed', keyword_list_stemmed, boost=1.5, slop=1))
    return all_fields_and_terms
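# Both __build_search_query variants return a flat list of Term/Phrase
# subqueries rather than a single query object. A plausible caller (an
# assumption; the real caller is not shown in the source) wraps the list in
# an Or so that any matching term or phrase contributes to a document's score:
from whoosh.query import Or

def run_search(searcher, subqueries, limit=20):
    return searcher.search(Or(subqueries), limit=limit)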
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"), boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)
    do(SpanFirst(Term("a", u("b")), limit=1),
       SpanFirst(Term("a", u("b")), limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
def test_posting_phrase():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"),
                        value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet"))
    writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat"))
    writer.add_document(name=u("D"),
                        value=u("Gibberish blonk falunk miss muffet sat tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.commit()

    with ix.searcher() as s:
        def names(results):
            return sorted([fields['name'] for fields in results])

        q = Phrase("value", [u("little"), u("miss"), u("muffet"), u("sat"),
                             u("tuffet")])
        m = q.matcher(s)
        assert_equal(m.__class__.__name__, "SpanNearMatcher")

        r = s.search(q)
        assert_equal(names(r), ["A"])
        assert_equal(len(r), 1)

        q = Phrase("value", [u("miss"), u("muffet"), u("sat"), u("tuffet")])
        assert_equal(names(s.search(q)), ["A", "D"])

        q = Phrase("value", [u("falunk"), u("gibberish")])
        r = s.search(q)
        assert_equal(names(r), [])
        assert_equal(len(r), 0)

        q = Phrase("value", [u("gibberish"), u("falunk")], slop=2)
        assert_equal(names(s.search(q)), ["D"])

        q = Phrase("value", [u("blah")] * 4)
        assert_equal(names(s.search(q)), [])  # blah blah blah blah

        q = Phrase("value", [u("blah")] * 3)
        m = q.matcher(s)
        assert_equal(names(s.search(q)), ["E"])
'''
Created on Oct 27, 2014

@author: Cassie
'''
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.query import And, Term, Not, FuzzyTerm, Phrase

ix = index.open_dir("index")

q1 = And([Term("city_text", u"greek"),
          Term("city_text", u"roman"),
          Not(Term("city_text", u"persian"))])
q2 = FuzzyTerm("city_text", u"shakespeare")
q3 = Phrase("city_text", [u"located", u"below", u"sea", u"level"], slop=10)

with ix.searcher() as s:
    results = s.search(q2, limit=None)
    for a in results:
        print(a['city_name'])
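# Only q2 is actually searched above; q1 and q3 can be run the same way
# (a trivial extension, not part of the original script):
with ix.searcher() as s:
    for q in (q1, q3):
        for hit in s.search(q, limit=None):
            print(hit['city_name'])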
def search(self, text, configuration):
    # Default query object: fall back to the parser for single-token queries,
    # otherwise search the tokens as a phrase.
    t = [i.text for i in self.query_analyzer(text)]
    query = self.parser.parse(text) if len(t) <= 1 else Phrase('text', t, slop=1)
    logger.info(repr(query))
    logger.info(repr(nltk.pos_tag(nltk.word_tokenize(text))))
    # print(extract(word_tokenize(text)))

    # Default results object
    results = []

    # Default query limit
    limit = 10
    if 'results_limit' in configuration and int(configuration['results_limit']) > 0:
        limit = int(configuration['results_limit'])

    # Default query expansion
    expansion = 'lca'
    expansion_threshold = 1.4
    expansion_terms = self._query_expansion_terms
    if 'query_expansion' in configuration:
        expansion = configuration['query_expansion']

    # Default ranking
    model = MODELS['bm25']
    searcher = self.searcher['bm25']
    if 'ranking' in configuration and configuration['ranking'] \
            and MODELS[configuration['ranking']]:
        model = MODELS[configuration['ranking']]
        searcher = self.searcher[configuration['ranking']]

    # Default link analysis
    link_analysis = False
    facet = lambda result: result.score
    if 'link_analysis' in configuration and configuration['link_analysis'] != 'none':
        link_analysis = configuration['link_analysis']
        if link_analysis == 'hits_rank':
            results = searcher.search(query, limit=self._hits_rank_relevant_window)
            auths, hubs = self.hitsrank.rank_from_results(results)
            facet = hits_rank_facet(auths, hubs)
        if link_analysis == 'page_rank':
            facet = page_rank_facet(self.pagerank)

    try:
        results = []
        print('* limit ', limit)
        print('* model: ', model.__name__)
        print('* expansion model: ', expansion)
        print('* link analysis: ', link_analysis)
        if expansion != 'none' and expansion is not False:
            if expansion == 'lca':
                # if link_analysis:
                #     results = searcher.search(query, limit=self._page_rank_relevant_window)
                #     results = sorted(results, key=facet, reverse=True)[:self._query_expansion_relevant_limit]
                #     expansion_threshold = 1.005
                # else:
                results = searcher.search(query,
                                          limit=self._query_expansion_relevant_limit)
                if len(results) >= self._query_expansion_relevant_limit:
                    terms = lca_expand(query, results, size=20, threshold=1.288)
                    # print(terms)
                    expanded_query = self.re_weight_query(query, terms)
                    logging.info(repr(expanded_query))
                    results = searcher.search(expanded_query, limit=limit)
            elif expansion == 'thesaurus':
                terms = thesaurus_expand(
                    text, self.wikimedia, size=10,
                    threshold=self.query_expansion_thesaurus_threshold)
                # print(terms)
                expanded_query = self.re_weight_query(query, terms)
                logging.info(repr(expanded_query))
                results = searcher.search(expanded_query, limit=limit)
        else:
            results = searcher.search(query, limit=limit)

        results = sorted(results, key=facet, reverse=True)
        return [r.Result(i, query) for i in results]
    except Exception as e:
        logger.error(e)
        return []
with codecs.open("tom_jones.txt", "w", "utf-8") as tom_jones_out: tom_jones_out.write(html) # 6. Search the index: with codecs.open("tom_jones.txt", "r", "utf-8") as tom_jones_in: tom_jones_trigrams = ngrams( tom_jones_in.read().replace("\r", "").replace("\n", " ").split( "PROJECT GUTENBERG")[1].split("PROJECT GUTENBERG")[0].split(), 3) with codecs.open("matching_searches.txt", "w", "utf-8") as matching_searches_out: ix = open_dir("index_for_sample_files") with ix.searcher() as searcher: for trigram in tom_jones_trigrams: phrase_query = Phrase("full_text", trigram) results = searcher.search(phrase_query) results.fragmenter.charlimit = None results.scorer = CustomScorer(phrase_query) for hit in results: # We've identified at least one hit in our index. Whoosh contains a built-in # set of tools we can use to "highlight" those hits, but we can also grep the # files with hits to extract the matching string in context file_with_hit = hit["path"] author_of_hit_file = hit["author"] title_of_hit_file = hit["short_title"] with codecs.open(hit["path"], "r", "utf-8") as fileobj: filecontents = fileobj.read() hit_highlights = hit.highlights("full_text",