Example #1
    def scoring_results(self, input_query, number_of_results):
        ix = index.open_dir(self.path + '/' + str(self.analyzer))
        # check the scoring parameter and set the scoring function accordingly
        if self.scoring == 'frequency':
            scoring_function = scoring.Frequency()
        elif self.scoring == 'tf_idf':
            scoring_function = scoring.TF_IDF()
        elif self.scoring == 'bm25f_1':
            scoring_function = scoring.BM25F(B=0.35, K1=0.7)
        elif self.scoring == 'bm25f_2':
            scoring_function = scoring.BM25F(B=0.75, K1=1.2)
        elif self.scoring == 'bm25f_3':
            scoring_function = scoring.BM25F(B=0.75, K1=2.3)
        elif self.scoring == 'bm25f_4':
            scoring_function = scoring.BM25F(B=0.9, K1=1.1)
        else:
            raise ValueError('scoring method not found')

        qp = QueryParser("content", ix.schema)
        parsed_query = qp.parse(input_query)  # parse the query
        searcher = ix.searcher(weighting=scoring_function)
        # execute the search
        results = searcher.search(parsed_query, limit=number_of_results)
        rr = []
        rank = 0
        # loop over search results
        for hit in results:
            rank += 1
            rr.append([hit['indx'], rank])

        # close searcher
        searcher.close()
        # return a list of [docID, rank] pairs
        return rr
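The if/elif chain above maps configuration names to Whoosh weighting objects; the same mapping can also be written as a dictionary lookup, which fails loudly on unknown names. A minimal sketch, reusing the parameter values from the example (the helper name is illustrative):

from whoosh import scoring

# Configuration names mapped to Whoosh weighting objects (values mirror Example #1).
SCORING_FUNCTIONS = {
    'frequency': scoring.Frequency(),
    'tf_idf': scoring.TF_IDF(),
    'bm25f_1': scoring.BM25F(B=0.35, K1=0.7),
    'bm25f_2': scoring.BM25F(B=0.75, K1=1.2),
    'bm25f_3': scoring.BM25F(B=0.75, K1=2.3),
    'bm25f_4': scoring.BM25F(B=0.9, K1=1.1),
}

def get_scoring_function(name):
    try:
        return SCORING_FUNCTIONS[name]
    except KeyError:
        raise ValueError('scoring method not found: %r' % name)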
Example #2
def createScoreFunction(s):
    if s == 1:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    elif s == 2:
        w = scoring.PL2(c=10.0)
        print("PL2")
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    return w
Example #3
def test():
    queries = load_queries()

    ix = index.open_dir(index_dir)
    qp = qparser.QueryParser('content', ix.schema)
    q = qp.parse("id")
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher_tfidf:
        scoring.TF_IDF().scorer(searcher_tfidf, 'body',
                                'algebra').score(q.matcher(searcher_tfidf))
    with ix.searcher(weighting=scoring.BM25F()) as searcher_bm25f:
        scoring.BM25F().scorer(searcher_bm25f, 'body',
                               'algebra').score(q.matcher(searcher_bm25f))
Example #4
def search(request):
    indexNewsObject = IndexNews()
    ix = indexNewsObject.ix
    if request.method == 'POST':
        inputQuery = request.POST['inputQuerySearchPage']
        request.session['inputQuery'] = inputQuery
        if inputQuery == '':
            context = {
                'message' : 'لطفا عبارت مورد نظر خود را وارد کنید'
            }
            return render(request,'searchPage/searchPage.html',context=context)
        else:
            # queryParser = QueryParser(fieldname='content',schema=ix.schema,group=OrGroup)
            # queryParser = MultifieldParser(['title','content'],schema=ix.schema,group=OrGroup)
            queryParser = MultifieldParser(['title','content','summary'],schema=ix.schema)
            query = queryParser.parse(inputQuery)
            with ix.searcher(weighting=scoring.BM25F()) as searcher:
                results = searcher.search(query,terms=True,limit=None)
                
                # customize the HTML tag used to highlight matched terms
                htmlFormat = highlight.HtmlFormatter('b')
                results.formatter = htmlFormat
                results.fragmenter.maxchars = 300
                results.fragmenter.surround = 150
                paginator = Paginator(results,15)
                page = request.GET.get('page')
                resultWithPage = paginator.get_page(page)
                context = {
                'results':resultWithPage,
                'inputQuery':inputQuery
                }
                return render(request,'searchPage/searchPage.html',context=context)
    else:
        inputQuery = request.session['inputQuery']
        # queryParser = QueryParser(fieldname='content',schema=ix.schema,group=OrGroup)
        queryParser = MultifieldParser(['title','content','summary'],schema=ix.schema)
        query = queryParser.parse(inputQuery)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            results = searcher.search(query,terms=True,limit=None)

            # customize the HTML tag used to highlight matched terms
            htmlFormat = highlight.HtmlFormatter('b')
            results.formatter = htmlFormat
            results.fragmenter.maxchars = 300
            results.fragmenter.surround = 150
            paginator = Paginator(results,15)
            page = request.GET.get('page')
            resultWithPage = paginator.get_page(page)
            context = {
            'results':resultWithPage,
            'inputQuery':inputQuery
            }
            return render(request,'searchPage/searchPage.html',context=context)
Example #5
def query(query_phrase):
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)

    filter_words = load_all_words()
    word_list = jieba.cut(query_phrase)
    query_phrase = " ".join([w for w in word_list \
        if w not in filter_words and len(w.strip())>0])
    query_phrase = query_phrase.replace("  "," ")
    
    print(type(query_phrase), query_phrase)

    ix = open_dir(INDEX_DIR)

    with ix.searcher(weighting=scoring.BM25F(B=0.1)) as searcher:

        query = QueryParser("content", ix.schema).parse(query_phrase)
        results = searcher.search(query, limit=150)
        re_json = []
        for e in results:
            score = float(e.score)*float(e["radio"])
            # print e.score,e["radio"]
            m = e.highlights("content").encode('utf8')
            re_json.append((score,e["path"],m))
            # print '*'*20
        print(len(re_json))
        rs = sorted(re_json,key=lambda x:x[0],reverse=True)
        res = query_output(rs)
        ix.close()
        
        return res
Example #6
def search_over_index(keywords):
    keys = nltk.word_tokenize(keywords)
    manifestos_list = []
    party_mentions_keyword = defaultdict(int)
    top_mentions = defaultdict(int)

    ix = open_dir("indexdir")
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        for k in keys:
            qp = QueryParser('text', ix.schema, group=OrGroup).parse(k)
            results = searcher.search(qp, limit=None)
            for r in results:
                # List with all the manifestos containing keywords
                if r['id'] not in manifestos_list:
                    manifestos_list.append(r['id'])
                party_mentions_keyword[(r['party'], k)] += 1
                top_mentions[k] += 1

    print("\nAll the manifestos containing the keywords:")
    for manifesto in manifestos_list:
        print(manifesto)

    print("\nHow many times each party mentions each keyword:")
    for key, value in party_mentions_keyword.items():
        print('{0:30}{1:35}{2:30}'.format(key[1], key[0], value))

    manifestos_keywords(manifestos_list)
    list_mentioned_keywords(top_mentions)
Example #7
def query_whoosh(whoosh_dir,
                 queries,
                 weighting=scoring.BM25F(),
                 num_results=50):
    res_sets = []
    # Weighting used for ranking documents
    ix = index.open_dir(whoosh_dir)

    # Examine effect of scoring on queries for key terms (and key terms themselves)

    # Highlight search term in results by making them UPPER CASE
    formatter = UppercaseFormatter()

    # Run queries and print results
    for q in queries:  # "new york", "empire state building", "oculus",
        cur = []
        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("body", ix.schema).parse(q)
            results = searcher.search(query, limit=num_results)
            results.formatter = formatter
            print_header("Query:   {}   returned {} results for {}".format(
                q, len(results), str(weighting)))
            # if print_results:
            for i, result in enumerate(results):
                cur.append(result['url'].replace('index.txt', ''))
                print_result(i, result)
                print()
        res_sets.append(set(cur))
    return res_sets
Example #8
def ranking(topic_id, p, index, model="TF-IDF"):
    topic_id = int(topic_id) - 101  # Normalize topic identifier to start at 0
    if model == "TF-IDF":
        weighting = scoring.TF_IDF()
    elif model == "BM25":
        weighting = scoring.BM25F()
    else:
        raise ValueError(
            "Invalid scoring model: please use 'TF-IDF' or 'BM25'")

    with open(os.path.join(corpus_dir, "..", "topics.txt")) as f:
        topics = f.read().split("</top>")[:-1]
    norm_topics = remove_tags(topics)
    topic = norm_topics[topic_id]

    if stemming:
        analyzer = StemmingAnalyzer()
    else:
        analyzer = StandardAnalyzer()

    tokens = [token.text for token in analyzer(topic)]
    string_query = ' '.join(tokens)
    with index.searcher(weighting=weighting) as searcher:
        q = MultifieldParser(
            ("date", "headline", "dateline", "byline", "content"),
            index.schema,
            group=OrGroup).parse(string_query)
        results = searcher.search(q, limit=p)
        return [(r["doc_id"], round(r.score, 4)) for r in results]
Example #9
File: script.py  Project: ruimangas/EADW
def whooshOpen(query):
    ix = open_dir("../lab3/indexdir")

    results_dict = {}

    query = QueryParser('content', ix.schema).parse(query)
    with ix.searcher(weighting=scoring.TF_IDF()) as s_tf:
        tf_results = s_tf.search(query, limit=100)
        for r in tf_results:
            results_dict.setdefault(r.docnum, []).append(r.score)

    with ix.searcher(weighting=scoring.BM25F()) as s_bm:
        bm_results = s_bm.search(query, limit=100)
        for r in bm_results:
            results_dict.setdefault(r.docnum, []).append(r.score)

    ranks = pageRank.pageRank(pageRank.inverted_index, 0.85, 10)

    l = []
    for (id, vals) in results_dict.items():
        if len(vals) == 2:
            l.append((vals[0], vals[1], ranks[id]))

    expected = start()

    ys = []
    for (tf, bm, pr) in l:
        ys.append(bm * expected[0] + tf * expected[1] + pr * expected[2] +
                  expected[3])

    print(ys)
Example #10
def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
Example #11
    def search(self,
               user_query,
               ranking_function=scoring.BM25F(),
               phraseSearch=False):
        qp = QueryParser("body", schema=self.ix.schema)

        # parse() turns the query string into a Whoosh query object.
        # By default the parser joins terms with AND, so every term must be
        # present for a document to match.
        # For a phrase search, wrap the query in quotes: "<query>".

        qp.add_plugin(qparser.GtLtPlugin)
        # qp.remove_plugin_class(qparser.PhrasePlugin)
        qp.add_plugin(qparser.PhrasePlugin)

        if phraseSearch:
            user_query = '"' + user_query + '"'

        query = qp.parse(user_query)
        print("# user_query", user_query, ", Query: ", query)
        print(query)

        with self.ix.searcher(weighting=ranking_function) as searcher:
            matches = searcher.search(query, limit=None)
            print("Total Number of Results:", len(matches))
            print("Number of scored and sorted docs in this Results object:",
                  matches.scored_length())
            results = [item.fields() for item in matches]

        resultsDF = pandas.DataFrame.from_dict(results)
        return (matches.docs(), resultsDF)
Example #12
def finalresult():
    if request.method == 'POST':
        #search query
        query = request.form['QA']
        print(query)
        results = []
        ix = index.open_dir("qadata_Index")
        schema = ix.schema
        # Create query parser that looks through designated fields in index
        og = qparser.OrGroup.factory(0.9)
        mp = qparser.MultifieldParser(['question', 'answer'], schema, group=og)
        # This is the user query
        q = mp.parse(request.form['QA'])
        # Actual searcher, prints top 10 hits
        with ix.searcher(weighting=scoring.BM25F()) as s:
            results = s.search(q, limit=5)
            for i in range(5):
                print(results[i]['question'], str(results[i].score),
                      results[i]['answer'])
            return render_template("result.html",
                                   searchquery=request.form['QA'],
                                   Q1=results[0]['question'],
                                   A1=results[0]['answer'],
                                   Q2=results[1]['question'],
                                   A2=results[1]['answer'],
                                   Q3=results[2]['question'],
                                   A3=results[2]['answer'],
                                   Q4=results[3]['question'],
                                   A4=results[3]['answer'],
                                   Q5=results[4]['question'],
                                   A5=results[4]['answer'])
Example #13
 def create_searcher(self):
     # create the searcher object
     ixr = open_dir(BASE_DIR + r"\search_engine\index", 'my_indexing')
     # with ixr.searcher(weighting=scoring.BM25F()) as searcher:
     self.searcher = ixr.searcher(weighting=scoring.BM25F())  # never closed, so it may leak memory; remember to shut down the server
     # create the parser (a multi-field query parser)
     self.parser = MultifieldParser(['title', 'text'], schema=self.schema)
Example #14
def score_to_file():

    # Open index
    ix = index.open_dir(index_dir)

    # Use the reader to get statistics
    reader = ix.reader()

    queries = load_queries()

    outfile = open(output_file, "w")
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        # with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        qp = qparser.QueryParser(field, schema=ix.schema)
        # qp = qparser.MultifieldParser(fields, schema=ix.schema)
        for query in queries:
            print("Processing query number", query['id'])

            # Retrieve documents using the vector space model
            q = qp.parse(query['text'])  # we concatenate query terms
            res = searcher.search(q)
            # res = get_score(searcher, qp, query['text'])
            for r in res:
                outfile.write(query['id'] + " Q0 " + r['id'] + " " +
                              str(r.score) + "\n")
            # Output max 50 results
            # for docnum in sorted(res, key=res.get, reverse=True)[:50]:
            #     # Look up our docID
            #     stored = reader.stored_fields(docnum)
            #     # Write `docID Q0 queryID score` into output file
            #     outfile.write(query['id']+ " Q0 " + stored['id'] + " " + str(res[docnum]) + "\n")
        outfile.close()
    ix.close()
Example #15
def ranking(topic_id, p, index, model="TF-IDF"):
    topic_id = int(topic_id) - 101  # Correct topic identifier to start at 0
    if model == "TF-IDF":
        weighting = scoring.TF_IDF()
    elif model == "TF":
        weighting = scoring.Frequency()
    elif model == "BM25":
        weighting = scoring.BM25F()
    else:
        raise ValueError(
            "Invalid scoring model: please use 'TF', 'TF-IDF' or 'BM25'")

    topic = process_topic(topic_id, topic_directory)[1]

    if stemming:
        analyzer = StemmingAnalyzer(stoplist=set(stopwords.words("english")))
    else:
        analyzer = StandardAnalyzer(stoplist=set(stopwords.words("english")))

    tokens = [token.text for token in analyzer(topic)]
    string_query = ' '.join(tokens)
    with index.searcher(weighting=weighting) as searcher:
        q = QueryParser("content", index.schema,
                        group=OrGroup).parse(string_query)
        results = searcher.search(q, limit=p)
        return [(r["doc_id"], round(r.score, 4)) for r in results]
Example #16
def conduct_search(query_str):
    '''
    Conducts a search over the indexed documents using a user-provided query.

    Args:
        query_str (string): The query used for the search

    Returns:
        results_list: A ranked list of up to 3 (title, content) tuples for the highest-scoring questions and answers
    '''

    #number of search results returned to user
    NUM_OF_RESULTS_SHOWN = 3

    #open the index directory
    ix = open_dir("indexdir")

    #conduct index search
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        query = MultifieldParser(["title", "content"],
                                 ix.schema,
                                 group=OrGroup).parse(query_str)
        results = searcher.search(query,
                                  limit=NUM_OF_RESULTS_SHOWN,
                                  terms=True)
        if NUM_OF_RESULTS_SHOWN < len(results):
            results_list = [(results[num]["title"], results[num]["content"])
                            for num in range(NUM_OF_RESULTS_SHOWN)]
        else:
            results_list = [(results[num]["title"], results[num]["content"])
                            for num in range(len(results))]
        return results_list
Example #17
def Search(query, ix):
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        parser = QueryParser("post", schema=ix.schema, group=OrGroup)
        seg_list = jieba.cut_for_search(query)
        querystring = " ".join(seg_list)
        print(querystring)
        query = parser.parse(querystring)
        results = searcher.search(query, limit=20)
        if len(results) == 0:
            answer = "啊?我不太明白,换一种问法看看?"
        else:
            candidates = []
            for result in results:
                candidate = {
                    'post': result['post'],
                    'reply': result['reply'].replace(" ", ""),
                    'score': result.score,
                    'postemood': result['postemood']
                }
                print(candidate)
                candidates.append(candidate)
            print(len(candidates))
            bestresult = Rank(candidates, querystring)
            answer = bestresult['reply']
            print(answer)
        return answer
Example #18
def SearchForTest(querypair, ix):
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        query = querypair[0]
        emotion = querypair[1]
        parser = QueryParser("post", schema=ix.schema, group=OrGroup)
        seg_list = jieba.cut_for_search(query)
        querystring = " ".join(seg_list)
        print(querystring)
        query = parser.parse(querystring)
        results = searcher.search(query, limit=20)
        if len(results) == 0:
            answer = ""
        else:
            candidates = []
            for result in results:
                candidate = {
                    'post': result['post'],
                    'reply': result['reply'],
                    'score': result.score,
                    'postemood': result['postemood']
                }
                print(candidate)
                candidates.append(candidate)
            print(len(candidates))
            bestresult = RankForTest(candidates, emotion)
            answer = bestresult['reply']
            print(answer)
        return answer
Example #19
    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)

        :returns results: dictionary containing number of hits, hits themselves and matched terms for each
        """
        tool_index_dir = os.path.join(trans.app.config.whoosh_index_dir, 'tools')
        index_exists = whoosh.index.exists_in(tool_index_dir)
        if index_exists:
            index = whoosh.index.open_dir(tool_index_dir)
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # __Basically__ the higher number the bigger weight.
                tool_weighting = scoring.BM25F(field_B={
                                               'name_B' : boosts.tool_name_boost,
                                               'description_B' : boosts.tool_description_boost,
                                               'help_B' : boosts.tool_help_boost,
                                               'repo_owner_username_B' : boosts.tool_repo_owner_username_boost})
                searcher = index.searcher(weighting=tool_weighting)

                parser = MultifieldParser([
                    'name',
                    'description',
                    'help',
                    'repo_owner_username'], schema=schema)

                user_query = parser.parse('*' + search_term + '*')

                try:
                    hits = searcher.search_page(user_query, page, pagelen=page_size, terms=True)
                except ValueError:
                    raise ObjectNotFound('The requested page does not exist.')

                log.debug('searching tools for: #' + str(search_term))
                log.debug('total hits: ' + str(len(hits)))
                log.debug('scored hits: ' + str(hits.scored_length()))
                results = {}
                results['total_results'] = str(len(hits))
                results['page'] = str(page)
                results['page_size'] = str(page_size)
                results['hits'] = []
                for hit in hits:
                    hit_dict = {}
                    hit_dict['id'] = hit.get('id')
                    hit_dict['repo_owner_username'] = hit.get('repo_owner_username')
                    hit_dict['repo_name'] = hit.get('repo_name')
                    hit_dict['name'] = hit.get('name')
                    hit_dict['description'] = hit.get('description')
                    matched_terms = {k: unicodify(v) for k, v in hit.matched_terms()}
                    results['hits'].append({'tool': hit_dict, 'matched_terms': matched_terms, 'score': hit.score})
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError('The search index file is missing.')
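Example #19 above boosts individual fields when ranking. In Whoosh, BM25F also accepts per-field B values directly as keyword arguments named <fieldname>_B. A minimal sketch, assuming an index directory "indexdir" with title and content fields (the directory name and query are illustrative):

from whoosh import index, scoring
from whoosh.qparser import MultifieldParser

# Per-field B values tune length normalization per field; K1 is shared.
weighting = scoring.BM25F(B=0.75, K1=1.2, title_B=0.5, content_B=1.0)

ix = index.open_dir("indexdir")  # assumed index location
with ix.searcher(weighting=weighting) as searcher:
    parser = MultifieldParser(["title", "content"], schema=ix.schema)
    results = searcher.search(parser.parse("ranking"), limit=10)
    for hit in results:
        print(hit.score)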
Example #20
def make_search_service(search_text):
  charmap = charset_table_to_dict(default_charset)
  custom_analyzers = StemmingAnalyzer()

  index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
  myindex = open_dir(index_path)
  qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup, fieldboosts={'title': 3.0, 'textdata': 0.8})
  qstring = search_text
  q = qp.parse(qstring)

  results_list = []

  myWeighting = scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5), textdata=scoring.Frequency(), title=scoring.BM25F(title_B=2.0))
  with myindex.searcher(weighting=myWeighting) as s:
    results = s.search(q, limit=30, terms=True)

    # "did you mean" and "results for" suggestions
    corrected = s.correct_query(q, qstring)
    did_you_mean = ""
    result_for = ""
    if corrected.query != q:
      if len(results) < 1:
        results = s.search(qp.parse(corrected.string), limit=30, terms=True)
        result_for = corrected.string
      else:
        did_you_mean = corrected.string


    #query expansion
    keywords = [keyword for keyword, score in results.key_terms("textdata", docs=3, numterms=5)]
    if keywords:
      query_keyword = qp.parse(reduce(lambda a, b: a + ' ' + b, keywords))
      results_keyword = s.search(query_keyword, limit=30, terms=True)
      results.upgrade_and_extend(results_keyword)

    #sorting
    key_sort = lambda result: result.score
    results = sorted(results, key=key_sort, reverse=True)

    
    for ris in results:
      result = {}
      result['title'] = ris['title']
      result['url'] = ris['url']
      result['id'] = ris['ID']
      result['highlight'] = ris.highlights("textdata")
      results_list.append(result)


    # for computing precision and recall
    id_results = [ris['id'] for ris in results_list[:10]]

    return {
      'search_text': search_text,
      'results': results_list, 
      'did_you_mean': did_you_mean,
      'result_for': result_for,
      'results_ids': id_results
    }
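Example #20 combines several ranking models through scoring.MultiWeighting, which takes a default weighting plus per-field overrides as keyword arguments. A minimal sketch with the same field names as above:

from whoosh import scoring

# Default weighting for all fields, with per-field overrides.
weighting = scoring.MultiWeighting(
    scoring.BM25F(),                    # default for any field not listed below
    title=scoring.BM25F(title_B=2.0),   # weight title matches more heavily
    textdata=scoring.Frequency(),       # raw term frequency for the body text
)
# Pass it as ix.searcher(weighting=weighting), exactly as in the example above.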
Example #21
def query_evaluator(file_directory,query_tsv_name,result_limit, *notitle):
    '''
    This function is used to evaluate all queries by the 24 different Search Engine.
    :param file_directory: directory where the indexes are stored ("C:./Cranfield_DATASET/" and "C:./Time_DATASET/" )
    :param query_tsv_name: Queries.tsv names (cran_Queries.tsv, time_Queries)
    :param result_limit: the number of top k results retrieved
    :param notitle: if given, only the body field is searched (used for the Time.csv dataset)
    :return: SEs, a dictionary whose keys are the search engines (e.g. SE_1, SE_2, ..., SE_24) and whose values are their query document results
    '''


    SEs = defaultdict(list) # dictionary where all the SEs will be stored
    query_path = file_directory+query_tsv_name # query_path = path where there are the queries ["C:./Cranfield_DATASET/cran_Queries.tsv", "C:./Time_DATASET/time_Queries.tsv"]
    
    analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                      'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer',  'LanguageAnalyzer'] # analyzers names
        
    counter = 1 # counter used to name the SEs
    for analyzer in analyzer_names: 

        index_directory = file_directory+'inverted_index_'+analyzer #get the directory where the index is stored

        ix = index.open_dir(index_directory) # open the index inside the chosen directory
        scoring_functions = [scoring.TF_IDF(),scoring.Frequency(),scoring.BM25F(B=0.75,K1=1.2)] # list of chosen scoring functions

        # per each index three different scoring functions are used:
        for score in scoring_functions:

            scoring_function = score # select the scoring function

            if notitle:  # for the Time dataset only the body field is searched
                # query parser over the body field only
                qp = QueryParser("body", ix.schema)
            else:  # for the Cranfield dataset both title and body are searched
                # multifield parser: the query is run over both the title and body fields
                qp = MultifieldParser(["title","body"], ix.schema)


            # Create the searcher for the index based on the predefined scoring function
            searcher = ix.searcher(weighting=scoring_function)

            with open(query_path) as tsvfile:  # read the queries from the .tsv file and parse them one by one
                querys = csv.reader(tsvfile, delimiter='\t')
                header = next(querys)  # skip the header row
                if header is not None:  # iterate over all the rows in the query file (cran_Queries.tsv, time_Queries.tsv)
                    for query in querys:
                        parsed_query = qp.parse(query[1])  # parse the query string into a Whoosh query object
                        results = searcher.search(parsed_query, limit=result_limit)  # run the query, keeping only the top result_limit hits

                        for hit in results:
                            # collect the query number, doc ID, rank and score for each hit
                            output = [query[0],hit['id'], str(hit.rank + 1), str(hit.score)]
                            SEs['SE_'+str(counter)].append(output) # the results are added to the predefined dictionary
            print('analyzer: '+analyzer, 'scoring_function: '+str(scoring_function).split('.')[2].split(' ')[0], '('+str(counter)+')')
            counter += 1
    return SEs
Example #22
 def generate_score(self, query, measure, k=None):
     '''Generate scores for a given query according to a given measure'''
     if (measure == 'bm25'):
         score = self.rank(query, weighting=scoring.BM25F(), k=k)
     elif (measure == 'cos'):
         score = self.rank(query, weighting=scoring.TF_IDF(), k=k)
     elif (measure == 'freq'):
         score = self.rank(query, weighting=scoring.Frequency(), k=k)
     return score
Example #23
def person_query_search(indexdir, queries, user_id, E, n=10, function='BM25F'):
    prediction = user_cf(E, user_id, 3)
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        # with ix.searcher(weighting=scoring.BM25F(B=0.75, resname_B = 1.0, categories_B = 0.8, K1=1.2)) as s:
        # add weight for the resname and the categories_B
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (relevance.max() -
                                                             relevance.min())
                # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(
                        results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (
                    relevance.max() - relevance.min()
                )  # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(
                        results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    return result_index
Example #24
def get_scoring(scoring_measure):
    foo = scoring.Frequency()

    if scoring_measure == "TF_IDF":
        foo = scoring.TF_IDF()
    if scoring_measure == "BM_25":
        foo = scoring.BM25F()
    if scoring_measure == "TF":
        foo = scoring.Frequency()
    return foo
Example #25
def search(query):
    my_index = index.open_dir("my_index")
    qp = QueryParser("content", schema=my_index.schema)
    result = {}
    q = qp.parse(query)
    with my_index.searcher(weighting=scoring.BM25F()) as s:
        results = s.search(q, terms=True, limit=20)
        for r in results:
            result[r.values()[0]] = round(r.score)
    return result
Example #26
def create_scoring_function(s):
	"""
	Returns a scoring function object, depending on the value of s.
	"""
	if s == 2:
		w = scoring.PL2(c=10.0)
	else:
		w = scoring.BM25F(B=0.75, K1=1.5)
	
	return w
Example #27
    def eval_get_ranked_set_baseline(self, basefile):
        # Step 1: Read the saved keyterms for a subset of articles
        # (created by analyze_baseline_queries)
        g = Graph()
        g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")

        articles = {}
        for (s, p, o) in g:
            if not str(s) in articles:
                articles[str(s)] = []
            articles[str(s)].append(str(o))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, create a query for each article based on
        # the keyterms.
        connector = query.Or
        indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
        storage = FileStorage(indexdir)
        idx = storage.open_index()
        searcher = idx.searcher(weighting=scoring.BM25F())

        res = {}

        # for article in sorted(articles.keys()):
        for article in self._articles(basefile):
            terms = articles[article]
            rankedset = []
            #parser = qparser.QueryParser("content", idx.schema)
            #q = parser.parse(connector.join(terms))
            q = query.And([
                # query.Term("articles", article),
                connector([query.Term("content", x) for x in terms])
            ])
            # print q
            # self.log.debug("Article %s: %s", article, " or ".join(terms))
            results = searcher.search(q, limit=None)
            resultidx = 0
            # self.log.info("Keyterms for result: %r" % results.key_terms("content", docs=10, numterms=10))
            for result in results:
                reslbl = "%s (%s)" % (result['basefile'],
                                      results.score(resultidx))
                rankedset.append([result['basefile'], reslbl])
                # self.log.debug(u"\t%s: %2.2d" % (result['title'], results.score(resultidx)))
                resultidx += 1
            self.log.info(
                "Created baseline ranked set for %s: Top result %s (of %s)" %
                (article.split("/")[-1], rankedset[0][0], len(rankedset)))

            # return just a list of URIs, no scoring information. But the
            # full URI isn't available in the whoosh db, so we recreate it.
            res[article] = [
                "http://lagen.nu/ext/celex/%s" % x[0] for x in rankedset
            ]

        return res
Example #28
    def search(self, query):
        ix = open_dir("news")
        if "s" in query.keys():
            s = query["s"][0]
        else:
            s = ""

        if "w" in query.keys():
            if query["w"] == "c":
                w = scoring.TF_IDF()
            elif query["w"] == "b":
                w = scoring.BM25F()
            else:
                w = TimeWeight()
        else:
            w = OurWeight()

        ret = {"r": [], "s": {}}
        with ix.searcher(weighting=w) as searcher:

            parser = MultifieldParser(["t", "d"], ix.schema,
                                      group=OrGroup).parse(unicode(s, "UTF-8"))
            results = searcher.search(parser, limit=100)
            for r in results:

                ret["r"].append({
                    "t": r["t"],
                    "d": r["d"],
                    "p": r["time"],
                    "l": r["link"],
                    "e": r["tags"],
                    "r": r["tags"]
                })

            corrector = searcher.corrector("d")
            for m in s.split():
                sug = corrector.suggest(m, limit=3)
                for s in sug:
                    if m not in ret["s"].keys():
                        ret["s"][m] = []
                    ret["s"][m].append(s)

            print ret["s"]
        f = StringIO()
        f.write(json.dumps(ret, indent=4, separators=(',', ': ')))
        length = f.tell()
        f.seek(0)
        self.send_response(200)
        encoding = sys.getfilesystemencoding()
        self.send_header("Content-type", "text/html; charset=%s" % encoding)
        self.send_header("Content-Length", str(length))
        self.end_headers()
        return f
Example #29
def exec_comp():
    '''
    Calculates MRR (Mean Reciprocal Rank) and saves a table with the MRR evaluation of every search engine configuration.
    '''
    #text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', 'LanguageAnalyzer()'
    ]  # analyzer labels used for the plot and the MRR table

    i = 0  #counter
    mrrs = []  #list where MRR values for each SE configuration will be stored

    #scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    #ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    #combinations for every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            sr_1 = exec_queries(
                selected_analyzers[x], scoring_functions[y]
            )  # execute queries for the chosen configuration combination
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv",
                        index=False)  #save results of the search engine
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1,
                                                        sr_1)))  #calculate MRR
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv",
                       index=False)  #store MRR table
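The mrr() helper used above is not shown. For reference, Mean Reciprocal Rank averages the reciprocal rank of the first relevant document over all queries; a minimal sketch, assuming ranked doc-ID lists per query and a set of relevant IDs per query (these data structures are illustrative, not the ones mrr() actually takes):

def mean_reciprocal_rank(results_per_query, relevant_per_query):
    # results_per_query: {query_id: [doc_id, ...]} ranked results
    # relevant_per_query: {query_id: set of relevant doc_ids}
    total = 0.0
    for qid, ranked in results_per_query.items():
        rr = 0.0
        for rank, doc_id in enumerate(ranked, start=1):
            if doc_id in relevant_per_query.get(qid, set()):
                rr = 1.0 / rank
                break
        total += rr
    return total / len(results_per_query) if results_per_query else 0.0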
Example #30
def search(humanReadableId):
    query = request.args.get('q', '').strip()
    pagination = None
    suggestion = None
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after
        # regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].find("Image:") == 0:
                return 1
            else:
                return 0

        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
                sorting.FieldFacet("reverse_links", reverse=True),
            ])
        else:
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
            ])

        (pagination, suggestion) = paginated_search(ix, ["title", "content"],
                                                    query,
                                                    page,
                                                    weighting=weighting,
                                                    sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))