Example #1
def test_span_characters():
    ix = get_index()
    with ix.searcher() as s:
        pq = Phrase("text", ["bravo", "echo"])
        m = pq.matcher(s)
        while m.is_active():
            orig = " ".join(s.stored_fields(m.id())["text"])
            for span in m.spans():
                startchar, endchar = span.startchar, span.endchar
                assert orig[startchar:endchar] == "bravo echo"
            m.next()
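The test above depends on a get_index() helper that the snippet doesn't show. Character spans (startchar/endchar) are only available when the field records character offsets, so a compatible index needs chars=True. A minimal sketch — the document text and the _stored_text trick (storing the word list so the test's " ".join(...) works) are assumptions:

def get_index():
    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    # chars=True makes the posting format record character offsets,
    # which is what span.startchar/span.endchar read back.
    schema = fields.Schema(text=fields.TEXT(stored=True, chars=True))
    ix = RamStorage().create_index(schema)
    words = "alfa bravo echo charlie".split()
    with ix.writer() as w:
        # _stored_text stores the word list instead of the raw string,
        # matching the " ".join(...) call in the test.
        w.add_document(text=" ".join(words), _stored_text=words)
    return ix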
Example #2
def test_boost_phrase():
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True), text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        t = u(" ").join(ls)
        w.add_document(title=t, text=t)
    w.commit()

    q = Or([Term("title", u("alfa")), Term("title", u("bravo")), Phrase("text", [u("bravo"), u("charlie"), u("delta")])])

    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 1000.0
            return q
        else:
            return q.apply(boost_phrases)
    q = boost_phrases(q)

    with ix.searcher() as s:
        r = s.search(q, limit=None)
        for hit in r:
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
Example #3
    def search(self, q: str):
        res = None

        with self.ix.searcher() as searcher:
            query = Phrase(self.search_index, self.process_query(q))
            #query = QueryParser(self.search_index, self.ix.schema).parse(self.process_query(q))
            results = searcher.search(query)
            res = list(results)
            res = [item.fields() for item in res]

        return res
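The process_query() helper isn't shown. Since Phrase takes a list of already-analyzed terms rather than a raw string, one plausible sketch (assuming the self.ix and self.search_index attributes used above) runs the text through the field's analysis chain:

    def process_query(self, q: str):
        # process_text applies the field's analyzer and yields token texts,
        # which is the form Phrase expects for its word list.
        field = self.ix.schema[self.search_index]
        return list(field.process_text(q, mode="query"))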
Example #4
def test_phrase_score():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"), value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("D"), value=u("Gibberish blonk falunk miss muffet sat tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.add_document(name=u("F"), value=u("Little miss muffet little miss muffet"))
    writer.commit()

    with ix.searcher() as s:
        q = Phrase("value", [u("little"), u("miss"), u("muffet")])
        m = q.matcher(s)
        assert_equal(m.id(), 0)
        score1 = m.weight()
        assert score1 > 0
        m.next()
        assert_equal(m.id(), 3)
        assert m.weight() > score1
Example #5
def re_weight_query(self, query, terms):
    print(terms)
    weights = [
        1 - (0.9 * ((i + 1) / len(terms))) for i, t in enumerate(terms)
    ]
    expanded_query = query.with_boost(1)
    for i, w in enumerate(weights):
        # distinct comprehension variable so it doesn't read as shadowing
        # the loop index i
        tokens = [tok.text for tok in self.query_analyzer(terms[i])]
        print(terms[i], tokens)
        q = Phrase('text', tokens, 3).with_boost(w)
        expanded_query = expanded_query | q
    return expanded_query
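A quick worked illustration of the falloff (the term list here is hypothetical): with three expansion terms the boosts decay linearly from 0.7 toward 0.1, and the | operator on Whoosh query objects combines the boosted phrases into an Or query, which is what re_weight_query relies on.

from whoosh.query import Phrase, Term

terms = ["alfa", "bravo", "charlie"]
weights = [1 - (0.9 * ((i + 1) / len(terms))) for i, t in enumerate(terms)]
print(weights)  # approximately [0.7, 0.4, 0.1]

# Query.__or__ wraps both operands in an Or query.
q = Term("text", "base") | Phrase("text", ["alfa", "bravo"], 3).with_boost(weights[0])
print(type(q).__name__)  # Or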
Example #6
def test_accept():
    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 2.0
        return q

    before = And([
        Term("a", u("b")),
        Or([Term("c", u("d")),
            Phrase("a", [u("e"), u("f")])]),
        Phrase("a", [u("g"), u("h")], boost=0.25)
    ])
    after = before.accept(boost_phrases)
    assert_equal(
        after,
        And([
            Term("a", u("b")),
            Or([Term("c", u("d")),
                Phrase("a", [u("e"), u("f")], boost=2.0)]),
            Phrase("a", [u("g"), u("h")], boost=0.5)
        ]))

    before = Phrase("a", [u("b"), u("c")], boost=2.5)
    after = before.accept(boost_phrases)
    assert_equal(after, Phrase("a", [u("b"), u("c")], boost=5.0))
Example #7
def test_phrase_sameword():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)

    writer = ix.writer()
    writer.add_document(id=1, text=u("The film Linda Linda Linda is good"))
    writer.add_document(id=2, text=u("The model Linda Evangelista is pretty"))
    writer.commit()

    with ix.searcher() as s:
        r = s.search(Phrase("text", ["linda", "linda", "linda"]), limit=None)
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], 1)
Example #8
def test_phrase_order():
    tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer())
    schema = fields.Schema(text=tfield)
    storage = RamStorage()
    ix = storage.create_index(schema)

    writer = ix.writer()
    for ls in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(ls))
    writer.commit()

    with ix.searcher() as s:
        def result(q):
            r = s.search(q, limit=None, sortedby=None)
            return sorted([d['text'] for d in r])

        q = Phrase("text", ["bay", "can", "day"])
        assert_equal(result(q), [u('ape bay can day'), u('bay can day ape')])
Example #9
def test_phrase_multi():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    domain = u("alfa bravo charlie delta echo").split()
    w = None
    for i, ls in enumerate(permutations(domain)):
        if w is None:
            w = ix.writer()
        w.add_document(id=i, text=u(" ").join(ls))
        if not i % 30:
            w.commit()
            w = None
    if w is not None:
        w.commit()

    with ix.searcher() as s:
        q = Phrase("text", ["alfa", "bravo"])
        _ = s.search(q)
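The staggered commits above deliberately leave the index with several segments, so the final search exercises phrase matching through a multi-segment reader. As a hedged extra check, the expected hit count is easy to derive: fusing "alfa bravo" into a single block leaves four items to permute, so 4! = 24 of the 120 permutations contain the phrase.

        r = s.search(q, limit=None)
        assert len(r) == 24  # "alfa bravo" as one block: 4! = 24 orderings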
Example #10
    def __build_search_query(self, keywords):
        keyword_list_no_stopw = kwlist_no_stopwords(keywords)
        keyword_list_stemmed = kwlist_stemmed(keywords)

        # build the terms to search
        fields_to_search = ['fieldname', 'fieldname_processed_parents',
                            'fieldname_processed_current',
                            'fieldname_current', 'title']

        all_fields_and_terms = [Term(f, kw) for kw in keyword_list_no_stopw
                                for f in fields_to_search]
        all_fields_and_terms.extend([Term('title_stemmed', kw)
                                     for kw in keyword_list_stemmed])

        if keyword_list_no_stopw and len(keyword_list_no_stopw) > 1:
            # noinspection PyTypeChecker
            all_fields_and_terms.append(Phrase('title', keyword_list_no_stopw,
                                               boost=2, slop=1))
        return all_fields_and_terms
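kwlist_no_stopwords() and kwlist_stemmed() are assumed by this snippet (and the next) but not shown. Plausible sketches built on Whoosh's stock analyzers — the exact stoplist and stemmer are assumptions:

from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer

def kwlist_no_stopwords(keywords):
    # StandardAnalyzer drops stopwords by default.
    return [t.text for t in StandardAnalyzer()(keywords)]

def kwlist_stemmed(keywords):
    # StemmingAnalyzer stems each token (and also drops stopwords).
    return [t.text for t in StemmingAnalyzer()(keywords)]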
Example #11
    def __build_search_query(cls, keywords):
        """
        prepares the search query over IR engine
        """
        keyword_list_no_stopw = kwlist_no_stopwords(keywords)
        keyword_list_stemmed = kwlist_stemmed(keywords)

        fields_to_search = ['fieldname', 'fieldname_processed_parents',
                            'fieldname_processed_current']

        all_fields_and_terms = [Term(f, kw) for kw in
                                keyword_list_stemmed
                                for f in fields_to_search]
        all_fields_and_terms.extend([Term('title_stemmed', kw)
                                     for kw in keyword_list_stemmed])
        # Note: we do not search by stopwords.
        # The phrase query also uses title_stemmed, so it will match the
        # stemmed title; the score boost therefore shouldn't be too high.
        if keyword_list_no_stopw and len(keyword_list_no_stopw) > 1:
            # noinspection PyTypeChecker
            all_fields_and_terms.append(
                Phrase('title_stemmed', keyword_list_stemmed,
                       boost=1.5, slop=1))
        return all_fields_and_terms
Example #12
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"),
                                                     boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)

    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")),
                                                        limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
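Equality and hashing are what make rewritten queries (for example the boosted copies in Example #6) comparable, and they also let query objects serve as set members or cache keys. A small check in the same style could be appended to the test:

    # Equal query trees hash equal, so duplicates collapse in a set.
    assert_equal(len({Phrase("a", [u("b"), u("c")]),
                      Phrase("a", [u("b"), u("c")])}), 1)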
Example #13
def test_posting_phrase():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("A"), value=u("Little Miss Muffet sat on a tuffet"))
    writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet"))
    writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat"))
    writer.add_document(name=u("D"), value=u("Gibberish blonk falunk miss muffet sat tuffet garbonzo"))
    writer.add_document(name=u("E"), value=u("Blah blah blah pancakes"))
    writer.commit()

    with ix.searcher() as s:
        def names(results):
            return sorted([fields['name'] for fields in results])

        q = Phrase("value", [u("little"), u("miss"), u("muffet"), u("sat"), u("tuffet")])
        m = q.matcher(s)
        assert_equal(m.__class__.__name__, "SpanNearMatcher")

        r = s.search(q)
        assert_equal(names(r), ["A"])
        assert_equal(len(r), 1)

        q = Phrase("value", [u("miss"), u("muffet"), u("sat"), u("tuffet")])
        assert_equal(names(s.search(q)), ["A", "D"])

        q = Phrase("value", [u("falunk"), u("gibberish")])
        r = s.search(q)
        assert_equal(names(r), [])
        assert_equal(len(r), 0)

        q = Phrase("value", [u("gibberish"), u("falunk")], slop=2)
        assert_equal(names(s.search(q)), ["D"])

        q = Phrase("value", [u("blah")] * 4)
        assert_equal(names(s.search(q)), [])  # blah blah blah blah

        q = Phrase("value", [u("blah")] * 3)
        m = q.matcher(s)
        assert_equal(names(s.search(q)), ["E"])
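The slop checks above suggest that slop bounds the positional distance between consecutive phrase words (the default of 1 means strictly adjacent, and word order still matters, as Example #8 shows). Under that reading, a hypothetical extra assertion for the same searcher block: in document A, "tuffet" sits three positions after "sat" ("sat on a tuffet"), while document C fails on word order.

        q = Phrase("value", [u("sat"), u("tuffet")], slop=3)
        assert_equal(names(s.search(q)), ["A", "D"])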
Example #14
'''
Created on Oct 27, 2014

@author: Cassie
'''
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.query import And, Term, Not, FuzzyTerm, Phrase

ix = index.open_dir("index")

q1 = And([
    Term("city_text", u"greek"),
    Term("city_text", u"roman"),
    Not(Term("city_text", u"persian"))
])
q2 = FuzzyTerm("city_text", u"shakespeare")
q3 = Phrase("city_text", [u"located", u"below", u"sea", u"level"], slop=10)

with ix.searcher() as s:
    results = s.search(q2, limit=None)
    for a in results:
        print(a['city_name'])
Example #15
    def search(self, text, configuration):
        # Default query object
        t = [i.text for i in self.query_analyzer(text)]
        query = self.parser.parse(text) if len(t) <= 1 else Phrase(
            'text', t, slop=1)
        logger.info(repr(query))
        logger.info(repr(nltk.pos_tag(nltk.word_tokenize(text))))

        # print(extract(word_tokenize(text)))

        # Default results object
        results = []

        # Default query limit
        limit = 10

        if 'results_limit' in configuration and int(
                configuration['results_limit']) > 0:
            limit = int(configuration['results_limit'])

        # Default Query Expansion
        expansion = 'lca'
        expansion_threshold = 1.4
        expansion_terms = self._query_expansion_terms

        if 'query_expansion' in configuration:
            expansion = configuration['query_expansion']

        # Default Ranking
        model = MODELS['bm25']
        searcher = self.searcher['bm25']

        if 'ranking' in configuration and configuration['ranking'] in MODELS:
            model = MODELS[configuration['ranking']]
            searcher = self.searcher[configuration['ranking']]

        # Default Link Analysis
        link_analysis = False
        facet = lambda result: result.score

        if 'link_analysis' in configuration and configuration[
                'link_analysis'] != 'none':
            link_analysis = configuration['link_analysis']

            if link_analysis == 'hits_rank':
                results = searcher.search(
                    query, limit=self._hits_rank_relevant_window)
                auths, hubs = self.hitsrank.rank_from_results(results)
                facet = hits_rank_facet(auths, hubs)

            if link_analysis == 'page_rank':
                facet = page_rank_facet(self.pagerank)

        try:
            results = []
            print('* limit ', limit)
            print('* model: ', model.__name__)
            print('* expansion model: ', expansion)
            print('* link analysis: ', link_analysis)

            if expansion != 'none' and expansion is not False:
                if expansion == 'lca':
                    # if link_analysis:
                    #     results = searcher.search(query, limit=self._page_rank_relevant_window)
                    #     results = sorted(results, key=facet, reverse=True)[:self._query_expansion_relevant_limit]
                    #     expansion_threshold = 1.005
                    # else:
                    results = searcher.search(
                        query, limit=self._query_expansion_relevant_limit)
                    if len(results) >= self._query_expansion_relevant_limit:
                        terms = lca_expand(query,
                                           results,
                                           size=20,
                                           threshold=1.288)
                        # print(terms)
                        expanded_query = self.re_weight_query(query, terms)
                        logger.info(repr(expanded_query))
                        results = searcher.search(expanded_query, limit=limit)
                elif expansion == 'thesaurus':
                    terms = thesaurus_expand(
                        text,
                        self.wikimedia,
                        size=10,
                        threshold=self.query_expansion_thesaurus_threshold)
                    # print(terms)
                    expanded_query = self.re_weight_query(query, terms)
                    logger.info(repr(expanded_query))
                    results = searcher.search(expanded_query, limit=limit)
            else:
                results = searcher.search(query, limit=limit)

            results = sorted(results, key=facet, reverse=True)
            return [r.Result(i, query) for i in results]

        except Exception as e:
            logger.error(e)
            return []
with codecs.open("tom_jones.txt", "w", "utf-8") as tom_jones_out:
    tom_jones_out.write(html)

# 6. Search the index:
with codecs.open("tom_jones.txt", "r", "utf-8") as tom_jones_in:
    tom_jones_trigrams = ngrams(
        tom_jones_in.read().replace("\r", "").replace("\n", " ").split(
            "PROJECT GUTENBERG")[1].split("PROJECT GUTENBERG")[0].split(), 3)

with codecs.open("matching_searches.txt", "w",
                 "utf-8") as matching_searches_out:
    ix = open_dir("index_for_sample_files")
    with ix.searcher() as searcher:
        for trigram in tom_jones_trigrams:

            phrase_query = Phrase("full_text", trigram)
            results = searcher.search(phrase_query)
            results.fragmenter.charlimit = None
            results.scorer = CustomScorer(phrase_query)
            for hit in results:

                # We've identified at least one hit in our index. Whoosh contains a built-in
                # set of tools we can use to "highlight" those hits, but we can also grep the
                # files with hits to extract the matching string in context
                file_with_hit = hit["path"]
                author_of_hit_file = hit["author"]
                title_of_hit_file = hit["short_title"]

                with codecs.open(hit["path"], "r", "utf-8") as fileobj:
                    filecontents = fileobj.read()
                    hit_highlights = hit.highlights("full_text",