Exemplo n.º 1
0
def hyper_parameter_evaluate():
    """Grid-search BM25F hyper-parameters (K1 and per-field B weights).

    Evaluates every combination of K1 and the three field-specific B
    values, writing one CSV row per combination to
    'ndcg_bm25_abstract.csv' and echoing each row to stdout.

    Row format: [k, titles_b, caption_and_headers_b, body_b, result].
    """
    ks = [0.25, 0.75, 1.25, 2]
    bs = [i / 10 for i in range(11)]
    # newline='' is required by the csv module to avoid blank rows on
    # platforms that translate line endings.
    with open('ndcg_bm25_abstract.csv', 'w', newline='') as file:
        csv_writer = csv.writer(file)
        for k in ks:
            for titles_b in bs:
                for caption_and_headers_b in bs:
                    for body_b in bs:
                        # BM25F with per-field B weights for this combination.
                        scoring_function = BM25F(
                            K1=k,
                            titles_B=titles_b,
                            caption_and_headers_B=caption_and_headers_b,
                            body_B=body_b)
                        search_function = search_bm25f_or
                        result = evaluate(search_function, scoring_function)
                        # Record the B values too: the original wrote only
                        # [k, result], which makes rows from the 11^3 inner
                        # combinations indistinguishable from one another.
                        row = [k, titles_b, caption_and_headers_b,
                               body_b, result]
                        csv_writer.writerow(row)
                        print(row)


# Do the evaluation.
#hyper_parameter_evaluate()
Exemplo n.º 2
0
 def search(self, q, tool_name_boost, tool_section_boost,
            tool_description_boost, tool_label_boost, tool_stub_boost,
            tool_help_boost, tool_search_limit):
     """
     Search the in-memory index, applying the supplied per-field boosts.

     Returns the ids of matching tools, best-scoring first, capped at
     tool_search_limit hits.
     """
     # Build the caller-supplied per-field boost table for the weighting.
     boosts = {
         'name_B': float(tool_name_boost),
         'section_B': float(tool_section_boost),
         'description_B': float(tool_description_boost),
         'labels_B': float(tool_label_boost),
         'stub_B': float(tool_stub_boost),
         'help_B': float(tool_help_boost)
     }
     searcher = self.index.searcher(weighting=BM25F(field_B=boosts))
     # Query across name, description, section, help, labels and stub.
     fields = ['name', 'description', 'section', 'help', 'labels', 'stub']
     parser = MultifieldParser(fields, schema=self.schema)
     # Hyphens act as wildcards in Whoosh, so re-tokenize them away.
     if '-' in q:
         q = ' '.join(token.text for token in self.rex(to_unicode(q)))
     # Wrap the query in wildcards so partial words still match.
     matches = searcher.search(parser.parse('*' + q + '*'),
                               limit=float(tool_search_limit))
     return [hit['id'] for hit in matches]
Exemplo n.º 3
0
def searching(idx, query, limit=10):
    """Run *query* against index *idx* under BM25F ranking.

    :param idx: an open Whoosh index.
    :param query: raw query string, parsed against the "text" field.
    :param limit: maximum number of hits to return (default 10).
    :returns: the Whoosh Results object for the search.
    """
    searcher = idx.searcher(weighting=BM25F())
    parsed = QueryParser("text", schema=idx.schema).parse(query)
    return searcher.search(parsed, limit=limit)
Exemplo n.º 4
0
	def search(self, queryEntered, page):
		"""
		Run a fuzzy multi-field movie search and return one result page.

		Returns parallel lists (title, plot, poster, tomato_score, year,
		actors, director, genre) plus a page count capped at 23.
		"""
		# Parallel accumulators, one per returned attribute.
		title    = list()
		plot     = list()
		poster   = list()
		year     = list()
		director = list()
		genre    = list()
		actors   = list()
		tomato_score = list()

		# JY for the sake of demonstrating ranking weight, not going to affect search much visibly.
		#mw=MultiWeighting(BM25F(), tomato_score=FunctionWeighting(custom_weight)) # plot=BM25F(B=0.75, plot_B=1.0, K1=2.0), actors=BM25F(B=0.75, actors_B=1.0, K1=1.5), director=TF_IDF()  )
		with self.indexer.searcher(weighting=BM25F()) as search: 
			# FuzzyTerm as termclass plus the plugins below tolerate typos
			# and quoted sequences in the user's query.
			parser = MultifieldParser(['title', 'plot','actors', 'director', 'genre'], schema=self.indexer.schema, termclass=FuzzyTerm) #
			parser.add_plugin(plugins.FuzzyTermPlugin())
			parser.add_plugin(plugins.SequencePlugin())
			query = parser.parse(queryEntered)
			# NOTE(review): sortedby is a set literal {'tomato_score'};
			# results are sorted by tomato_score descending, 20 per page —
			# confirm the set (vs a plain string/list) is intentional.
			results = search.search_page(query, page, 20, sortedby = {'tomato_score'}, reverse=True) # 'tomato_score', 'year'

			for x in results:
				title.append(x['title'])
				plot.append(x['plot'])
				poster.append(x['poster'])
				tomato_score.append(x['tomato_score'])
				year.append(x['year'])
				director.append(x['director'])
				actors.append(x['actors'])
				genre.append(x['genre'])

		# pagecount was computed inside the with-block; the UI shows at
		# most 23 pages.
		return title, plot, poster, tomato_score, year, actors, director, genre, results.pagecount if results.pagecount < 23 else 23 
Exemplo n.º 5
0
 def search( self, query, return_attribute='id' ):
     """Search title, description and help; return one attribute per hit.

     Title is weighted highest (3), then description (2), then help (1).
     """
     weighting = BM25F( field_B={ 'title_B' : 3, 'description_B' : 2, 'help_B' : 1 } )
     searcher = self.index.searcher( weighting=weighting )
     # Parse the query against all three fields at once.
     parser = MultifieldParser( [ 'title', 'description', 'help' ], schema = schema )
     hits = searcher.search( parser.parse( query ) )
     return [ hit[ return_attribute ] for hit in hits ]
Exemplo n.º 6
0
    def search(self, q: str, tool_name_boost: CanConvertToFloat,
               tool_id_boost: CanConvertToFloat,
               tool_section_boost: CanConvertToFloat,
               tool_description_boost: CanConvertToFloat,
               tool_label_boost: CanConvertToFloat,
               tool_stub_boost: CanConvertToFloat,
               tool_help_boost: CanConvertToFloat,
               tool_search_limit: CanConvertToFloat,
               tool_enable_ngram_search: bool,
               tool_ngram_minsize: CanConvertToInt,
               tool_ngram_maxsize: CanConvertToInt) -> List[str]:
        """
        Perform search on the in-memory index. Weight in the given boosts.

        Each field gets its own BM25F weighting via MultiWeighting, using
        the caller-supplied boost for that field. When ngram search is
        enabled the work is delegated to self._search_ngrams; otherwise a
        single wildcard-wrapped query is run. Returns matching tool ids.
        """
        # Change field boosts for searcher
        self.searcher = self.index.searcher(weighting=MultiWeighting(
            BM25F(),
            old_id=BM25F(old_id_B=float(tool_id_boost)),
            name=BM25F(name_B=float(tool_name_boost)),
            section=BM25F(section_B=float(tool_section_boost)),
            description=BM25F(description_B=float(tool_description_boost)),
            labels=BM25F(labels_B=float(tool_label_boost)),
            stub=BM25F(stub_B=float(tool_stub_boost)),
            help=BM25F(help_B=float(tool_help_boost))))
        # Use OrGroup to change the default operation for joining multiple terms to logical OR.
        # This means e.g. for search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
        # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
        # However this changes scoring i.e. searching 'bowtie of king arthur' a document with 'arthur arthur arthur'
        # would have a higher score than a document with 'bowtie arthur' which is usually unexpected for a user.
        # Hence we introduce a bonus on multi-hits using the 'factory()' method using a scaling factor between 0-1.
        # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
        # Adding the FuzzyTermPlugin to account for misspellings and typos, using a max distance of 2
        og = OrGroup.factory(0.9)
        self.parser = MultifieldParser([
            'name', 'old_id', 'description', 'section', 'help', 'labels',
            'stub'
        ],
                                       schema=self.schema,
                                       group=og)

        # Normalize case once; the index is presumably lower-cased too —
        # TODO confirm against the indexing code.
        cleaned_query = q.lower()
        if tool_enable_ngram_search is True:
            rval = self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                       tool_ngram_maxsize, tool_search_limit)
            return rval
        else:
            # Re-tokenize with self.rex and rejoin with spaces.
            cleaned_query = ' '.join(token.text
                                     for token in self.rex(cleaned_query))
            # Use asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
            parsed_query = self.parser.parse(f"*{cleaned_query}*")
            # NOTE(review): sortedby='' is unusual — looks intended to keep
            # score order; confirm against Whoosh's sortedby semantics.
            hits = self.searcher.search(parsed_query,
                                        limit=float(tool_search_limit),
                                        sortedby='')
            return [hit['id'] for hit in hits]
Exemplo n.º 7
0
 def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize ):
     """
     Perform search on the in-memory index. Weight in the given boosts.

     Returns a list of tool ids ordered by BM25F score; when ngram search
     is enabled, scores of tools hit by multiple ngrams are summed before
     ranking. Output is truncated to tool_search_limit entries.
     """
     # Change field boosts for searcher
     searcher = self.index.searcher(
         weighting=BM25F(
             field_B={ 'name_B': float( tool_name_boost ),
                       'section_B': float( tool_section_boost ),
                       'description_B': float( tool_description_boost ),
                       'labels_B': float( tool_label_boost ),
                       'stub_B': float( tool_stub_boost ),
                       'help_B': float( tool_help_boost ) }
         )
     )
     # Set query to search name, description, section, help, and labels.
     parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels', 'stub' ], schema=self.schema )
     # Hyphens are wildcards in Whoosh causing bad things
     if q.find( '-' ) != -1:
         q = (' ').join( [ token.text for token in self.rex( to_unicode( q ) ) ] )
     # Perform tool search with ngrams if set to true in the config file
     if ( tool_enable_ngram_search is True or tool_enable_ngram_search == "True" ):
         # Aggregate per-ngram scores per tool id in this dict.
         hits_with_score = {}
         token_analyzer = StandardAnalyzer() | analysis.NgramFilter( minsize=int( tool_ngram_minsize ), maxsize=int( tool_ngram_maxsize ) )
         ngrams = [ token.text for token in token_analyzer( q ) ]
         for query in ngrams:
             # Get the tool list with respective scores for each qgram
             curr_hits = searcher.search( parser.parse( '*' + query + '*' ), limit=float( tool_search_limit ) )
             for i, curr_hit in enumerate( curr_hits ):
                 is_present = False
                 for prev_hit in hits_with_score:
                     # Check if the tool appears again for the next qgram search
                     if curr_hit[ 'id' ] == prev_hit:
                         is_present = True
                         # Add the current score with the previous one if the
                         # tool appears again for the next qgram
                         hits_with_score[ prev_hit ] = curr_hits.score(i) + hits_with_score[ prev_hit ]
                 # Add the tool if not present to the collection with its score
                 if not is_present:
                     hits_with_score[ curr_hit[ 'id' ] ] = curr_hits.score(i)
         # Sort the results based on aggregated BM25 score in decreasing order of scores
         hits_with_score = sorted( hits_with_score.items(), key=lambda x: x[1], reverse=True )
         # Return the tool ids
         return [ item[0] for item in hits_with_score[ 0:int( tool_search_limit ) ] ]
     else:
         # Perform the search
         hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
         return [ hit[ 'id' ] for hit in hits ]
Exemplo n.º 8
0
    def search(self, query, return_attribute='id'):
        """Wildcard search over title/description/section/help.

        Returns the requested attribute of the top 20 hits, best first.
        """
        # Per-field BM25F boosts: title weighted most heavily, help least.
        weights = {
            'title_B': 9,
            'section_B': 3,
            'description_B': 2,
            'help_B': 0.5
        }
        searcher = self.index.searcher(weighting=BM25F(field_B=weights))
        # Parse the query against all four fields at once.
        fields = ['title', 'description', 'section', 'help']
        parser = MultifieldParser(fields, schema=schema)
        # Surround the query with wildcards so partial words still match.
        results = searcher.search(parser.parse('*' + query + '*'), limit=20)
        return [result[return_attribute] for result in results]
Exemplo n.º 9
0
 def search(self, q, tool_name_boost, tool_section_boost,
            tool_description_boost, tool_label_boost, tool_stub_boost,
            tool_help_boost, tool_search_limit, tool_enable_ngram_search,
            tool_ngram_minsize, tool_ngram_maxsize):
     """
     Perform search on the in-memory index. Weight in the given boosts.

     Returns a list of matching tool ids, best first. When ngram search
     is enabled, delegates to self._search_ngrams.
     """
     # Change field boosts for searcher
     self.searcher = self.index.searcher(weighting=BM25F(
         field_B={
             'name_B': float(tool_name_boost),
             'section_B': float(tool_section_boost),
             'description_B': float(tool_description_boost),
             'labels_B': float(tool_label_boost),
             'stub_B': float(tool_stub_boost),
             'help_B': float(tool_help_boost)
         }))
     # Use OrGroup to change the default operation for joining multiple terms to logical OR.
     # This means e.g. for search 'bowtie of king arthur' a document that only has 'bowtie' will be a match.
     # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
     # However this changes scoring i.e. searching 'bowtie of king arthur' a document with 'arthur arthur arthur'
     # would have a higher score than a document with 'bowtie arthur' which is usually unexpected for a user.
     # Hence we introduce a bonus on multi-hits using the 'factory()' method using a scaling factor between 0-1.
     # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
     og = OrGroup.factory(0.9)
     self.parser = MultifieldParser(
         ['name', 'description', 'section', 'help', 'labels', 'stub'],
         schema=self.schema,
         group=og)
     cleaned_query = q.lower()
     # Replace hyphens, since they are wildcards in Whoosh causing false positives
     if cleaned_query.find('-') != -1:
         cleaned_query = (' ').join(
             token.text for token in self.rex(to_unicode(cleaned_query)))
     if tool_enable_ngram_search is True:
         rval = self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                    tool_ngram_maxsize, tool_search_limit)
         return rval
     else:
         # Use asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
         # (note: only a trailing wildcard is appended here, unlike the
         # leading+trailing wildcards used by sibling implementations).
         parsed_query = self.parser.parse(cleaned_query + '*')
         # NOTE(review): sortedby='' is unusual — looks intended to keep
         # score order; confirm against Whoosh's sortedby semantics.
         hits = self.searcher.search(parsed_query,
                                     limit=float(tool_search_limit),
                                     sortedby='')
         return [hit['id'] for hit in hits]
Exemplo n.º 10
0
 def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_help_boost, tool_search_limit ):
     """
     Perform search on the in-memory index. Weight in the given boosts.

     Returns the ids of the best-matching tools, at most
     tool_search_limit of them.
     """
     # Per-field boosts supplied by the caller.
     boosts = { 'name_B': float( tool_name_boost ),
                'section_B': float( tool_section_boost ),
                'description_B': float( tool_description_boost ),
                'help_B': float( tool_help_boost ) }
     searcher = self.index.searcher( weighting=BM25F( field_B=boosts ) )
     # Query across name, description, section and help.
     parser = MultifieldParser( [ 'name', 'description', 'section', 'help' ], schema=schema )
     # Wrap the query in wildcards so partial words still match.
     matches = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
     return [ match[ 'id' ] for match in matches ]
Exemplo n.º 11
0
def get_searcher(index=INDEX, score_by="BM25F"):
    """
    get_searcher([index=INDEX, score_by="BM25F"])

    Return a searcher for the index given in ``index``. The ``score_by``
    parameter selects the scoring function for the (query, document)
    pair. Supported scoring functions: TF-IDF and BM25F.

    .. code-block:: python

        >>> from searcher import get_searcher
        >>> from index import get_index
        >>>
        >>> idx = get_index()
        >>> searcher = get_searcher(idx, score_by="BM25F")
        >>>

    :param index: Document index.
    :type index: FileIndex
    :param score_by: Scoring function between the user query and a retrieved document.
    :type score_by: str
    :returns: Searcher
    :raises ValueError: if ``score_by`` names an unsupported scoring function.
    """

    try:
        from whoosh.scoring import BM25F, TF_IDF
    except ImportError:
        # Re-raise: without these classes no scoring function can be
        # built (the original swallowed the error and then crashed with
        # NameError on score_function below).
        print("Ocorreu um erro na importação das funções de pontuação.")
        raise

    # Normalize to upper case before dispatching.
    score_by = score_by.upper()

    if score_by == "TF-IDF":
        score_function = TF_IDF()
    elif score_by == "BM25F":
        score_function = BM25F()
    else:
        # The original left score_function unbound here (NameError);
        # fail fast with a clear message instead.
        raise ValueError("Unknown scoring function: %s" % score_by)

    return index.searcher(weighting=score_function)
Exemplo n.º 12
0
def search_index(query, score_func_name, dirname):
    """Search the Whoosh index in *dirname* with a named scoring function.

    :param query: raw user query string, parsed against the "content" field.
    :param score_func_name: one of 'ok', 'bm25f', 'pln', 'tfidf', 'freq';
        any other value falls back to OkBM25 (same as the original default).
    :param dirname: directory holding the Whoosh index.
    :returns: Whoosh Results (no limit) with fragmenter surround set to 100.
    """
    ix = index.open_dir(dirname, schema=get_schema())
    # OR semantics with a 0.9 bonus factor for multi-term hits.
    og = OrGroup.factory(0.9)
    qp = QueryParser("content", schema=get_schema(), group=og)
    q = qp.parse(query)
    # Map names to scoring classes and instantiate only the requested one
    # (the original always built a throwaway OkBM25() first, then
    # possibly replaced it in the if/elif chain).
    scorers = {
        'ok': OkBM25,
        'bm25f': BM25F,
        'pln': PLN,
        'tfidf': TF_IDF,
        'freq': Frequency,
    }
    score_func = scorers.get(score_func_name, OkBM25)()
    searcher = ix.searcher(weighting=score_func)
    results = searcher.search(q, limit=None)
    # Show 100 characters of context around each highlighted fragment.
    results.fragmenter.surround = 100
    return results
Exemplo n.º 13
0
 def scorer(self, searcher, fieldname, text, qf=1):
     """Build this weighting's Scorer from BM25F and TF-IDF sub-scorers."""
     bm25_scorer = BM25F().scorer(searcher, fieldname, text, qf)
     tfidf_scorer = TF_IDF().scorer(searcher, fieldname, text, qf)
     # The composite Scorer takes (tfidf, bm25) in that order.
     return self.Scorer(tfidf_scorer, bm25_scorer)
Exemplo n.º 14
0
# NOTE(review): this elif continues an if-statement that starts before
# this excerpt (presumably testing operation_type == "AND" or similar).
elif (operation_type == "OR"):
    op_type = qparser.OrGroup
else:
    op_type = qparser.AndGroup

dirname = "indexdir"
ix = open_dir(dirname)
# Parse the query across content, path, title and all heading fields.
qp = qparser.MultifieldParser(
    ['content', 'path', 'title', 'head1', 'head2', 'head3', 'head4'],
    ix.schema,
    group=op_type)
qp.add_plugin(qparser.PlusMinusPlugin)
query = qp.parse(search_input)
# print(query)
# Choose the ranking model; note the else branch builds the same BM25F
# as the "BM25" branch, so BM25F is effectively the default.
if search_type == "BM25":
    w = BM25F(B=0.75, K1=1.5)
elif search_type == "TFIDF":
    w = TF_IDF()
else:
    w = BM25F(
        B=0.75,
        K1=1.5,
    )
with ix.searcher(weighting=w) as searcher:
    results = searcher.search(query, terms=True)
    # ~50-char fragments with 50 chars of surrounding context.
    results.fragmenter = highlight.ContextFragmenter(
        maxchars=50,
        surround=50,
    )
    # print(list(searcher.lexicon("content")))
    found_doc_num = results.scored_length()
from evaluation_BM25 import evaluate
from whoosh.scoring import BM25F
from search import search_bm25f_and, search_bm25f_or

# Fixed BM25F hyper-parameters shared by both evaluation runs.
scoring_function = BM25F(
    K1=1.5, titles_B=0.9, caption_and_headers_B=0.9, body_B=0.2)
search_function_and = search_bm25f_and
search_function_or = search_bm25f_or
# Evaluate AND-style and OR-style retrieval with the same weighting.
result_and = evaluate(search_function_and, scoring_function)
result_or = evaluate(search_function_or, scoring_function)
print("Result for and: ")
print(result_and)
print("Result for or ")
print(result_or)
Exemplo n.º 16
0
        writer.add_document(path=link['href'],
                            title=filename[:-4],
                            content=f_string)
    except:
        writer.add_document(path=u'None',
                            title=filename[:-4],
                            content=f_string)

writer.commit()
# OR-combine terms across the content, path and title fields.
qp = qparser.MultifieldParser(['content', 'path', 'title'],
                              ix.schema,
                              group=qparser.OrGroup)
query = qp.parse("transgenic growth ")
# print(query)

# Candidate weightings; only Frequency (f) is actually used below.
b = BM25F(B=0.75, K1=1.5)
t = TF_IDF()
f = Frequency()
with ix.searcher(weighting=f) as searcher:
    results = searcher.search(
        query,
        terms=True,
    )
    # ~50-char fragments with 90 chars of surrounding context.
    results.fragmenter = highlight.ContextFragmenter(
        maxchars=50,
        surround=90,
    )

    if results:
        for hit in results:
            # NOTE(review): loop body continues beyond this excerpt.
            snip = hit.highlights('content')