Example #1
import sys

import cranqry
import index


def query():
    """The main query processing program, using QueryProcessor."""

    II = index.InvertedIndex()
    index_file = II.load(sys.argv[1])  # path to the pre-built index file

    proc_alg = sys.argv[2]  # '0' = boolean model, '1' = vector model
    q_text = sys.argv[3]    # path to the query file, e.g. query.text
    qid = sys.argv[4]       # query id, e.g. '069'

    qrys = cranqry.loadCranQry(q_text)  # qrys is a dict keyed by query id
    #qrys = cranqry.loadCranQry('../CranfieldDataset/query.text')  # can also be hard-coded

    qp = QueryProcessor(
        qrys[qid].text, index_file,
        'cran.all')  # qid and index_file are supplied by the user

    if proc_alg == '0':
        print(qp.booleanQuery())
    elif proc_alg == '1':
        print(qp.vectorQuery(3))  # top 3 ranked results for the vector model
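
If this function lives in a script (the name query.py below is an assumption), a minimal entry point might look like this; the argument order is inferred from the sys.argv reads above.

# Hypothetical invocation, assuming an index was already built into
# index_file.json:
#
#   python query.py index_file.json 0 query.text 069   (boolean model)
#   python query.py index_file.json 1 query.text 069   (vector model)
if __name__ == '__main__':
    query()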
Example #2
def query(index_file, processing_algorithm, query_file, query_id):
    """ the main query processing program, using QueryProcessor"""
    cranqryobj = cranqry.loadCranQry(query_file)
    dict_query = {}
    for q in cranqryobj:
        dict_query[q] = cranqryobj[q].text
    query_txt = dict_query[query_id]
    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)
    QPobj = QueryProcessor(query_txt, items, index_file)
    QPobj.preprocessing()
    doc_ids = []
    if processing_algorithm == "0":  # boolean Query
        doc_ids = QPobj.booleanQuery()
    elif processing_algorithm == "1":  # vector Query
        doc_ids = QPobj.vectorQuery(3)  # first 3 documents based on ranking
    else:
        print("Invalid Processing algorithm")
    print(doc_ids)
    return doc_ids
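
For reference, a call to this version might look like the following; the file names are assumptions mirroring the hard-coded Cranfield paths used elsewhere on this page.

# Hypothetical call: vector model, top 3 documents for query '069'
doc_ids = query('index_file.json', '1',
                '../CranfieldDataset/query.text', '069')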
Example #3
queries on your search engine. For everything to go smoothly,
make sure the engine works on pre-computed indices; the main
file should not build the index from scratch. When the user
executes the file, it should be able to choose:
+ search_engine: a parameter the user sets to choose which
  search engine to run. According to the homework requirements,
  it can be 1, 2 or 3.
+ Any other parameters you would like.
"""

import utils
import index
import index_utils

inverted_index = index.InvertedIndex(index.idx)


def menu():
    choice = 0
    print(
        "1 - Search without score\n2 - Search with score\n3 - Search with 'new' score!"
    )
    while choice not in (1, 2, 3):
        choice = int(input("Number (1, 2, 3): "))

    result = 'None'
    search_term = input("Enter term(s) to search: ")
    if choice == 1:
        result = inverted_index.lookup_conjunctive_query(
            index_utils.format_text(search_term))
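
The snippet is truncated before the remaining branches, and index_utils.format_text is not shown. A minimal sketch of what such a normalizer might do, purely as an assumption about its behavior:

import re

def format_text(text):
    # Hypothetical sketch: lowercase the query and keep alphanumeric
    # tokens, mirroring the normalization an indexer would apply.
    return re.findall(r"[a-z0-9]+", text.lower())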
Example #4
import random

import pandas as pd

import cranqry
import index
import metrics


def to_ndcg(qrels, q_text, idx_file, tk=10, n=2):
    # dataframe column names, for easier data manipulation
    column_names = ['qid', 'docid', 'bool_rel', 'vec_rel']
    df_qrels = pd.read_csv('../CranfieldDataset/qrels.sample',
                           names=column_names,
                           sep=' ')  # hard-coded for testing
    #df_qrels = pd.read_csv(qrels, names=column_names, sep=' ')

    unique_qids = list(set(df_qrels.qid.values))
    random.shuffle(unique_qids)
    random_qids = unique_qids[0:n]

    # qrys is a dict keyed by query id---hard-coded path for testing
    qrys = cranqry.loadCranQry('../CranfieldDataset/query.text')
    #qrys = cranqry.loadCranQry(q_text)

    qrys_ids = list(qrys.keys())

    II = index.InvertedIndex()
    index_file = II.load("index_file.json")  # hard-coded for testing
    #index_file = II.load(idx_file)

    # aggregate ndcg scores for the two models
    vec_agg_ndcg, bool_agg_ndcg = [], []
    for qid in random_qids:
        print(qid)
        df_qid = df_qrels[df_qrels["qid"] == qid]  # dataframe for one query id

        # doc ids for the chosen query id from qrels---used later for ndcg_score
        qid_docids = list(df_qid['docid'])
        print(qid_docids)

        # important: the integer qid must be zero-padded to match the
        # three-character query ids in the Cranfield dataset (e.g. 69 -> '069')
        st_qid = str(qid).zfill(3)

        if st_qid in qrys_ids:
            qp = QueryProcessor(qrys[st_qid].text, index_file, 'cran.all')

            bool_array = qp.booleanQuery()
            vec_array = qp.vectorQuery(10)  # change back to 'tk'
            bool_array = [int(v) for v in bool_array]
            print(bool_array)

            # ndcg for the boolean model: (relevance, score) pairs for the
            # first 10 returned doc ids
            bool_list = [(0, 0)] * 10  # change back to tk
            for idx, doc_id in enumerate(bool_array[:10]):
                # check whether each doc id returned by the boolean model is
                # present in qrels for this query
                if doc_id in qid_docids:
                    bool_list[idx] = (1, 1)
                else:
                    bool_list[idx] = (0, 1)

            y_true = [bool_id[0] for bool_id in bool_list]
            y_score = [bool_id[1] for bool_id in bool_list]
            print("bool", y_true)
            print("bool", y_score)

            bool_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, 10))

            # ndcg for the vector model
            print(vec_array)
            # vec_array holds (docid, similarity_score) tuples; the cosine
            # similarity scores become y_score for ndcg_score
            y_score = [vec_id[1] for vec_id in vec_array]
            vec_ids = [int(vec_id[0]) for vec_id in vec_array]
            y_true = [0] * 10  # change back to tk
            for idx, doc_id in enumerate(vec_ids[:10]):
                # mark the rank positions whose doc id appears in qrels
                if doc_id in qid_docids:
                    y_true[idx] = 1
            print("vec", y_true)
            print("vec", y_score)
            vec_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, 10))

            del qp  # release the processor before the next query

    return bool_agg_ndcg, vec_agg_ndcg
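
A call to to_ndcg might look like the following; the argument values are assumptions based on the hard-coded test paths in the function body.

# Hypothetical call: score n=2 random queries, truncating both models
# at rank tk=10
bool_ndcg, vec_ndcg = to_ndcg('../CranfieldDataset/qrels.text',
                              '../CranfieldDataset/query.text',
                              'index_file.json', tk=10, n=2)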
Example #5

import random

import numpy as np
from scipy import stats

import cranqry
import index
# ndcg_score and QueryProcessor are defined elsewhere in this project


def eval(index_file, query_text, qrels, n):
    qrys = cranqry.loadCranQry(query_text)
    queries = {}
    for q in qrys:
        queries[q] = qrys[q].text
    query_ids = sorted(queries.keys())
    query_ids_ints = [int(qid) for qid in query_ids]
    set1 = set()
    while len(set1) != n:  # pick n distinct query ids at random
        set1.add(random.choice(query_ids_ints))
    selected_queries = list(set1)
    qrels_dict = {}
    f = open("qrels.text", "r")  # parsing relevant queries(qrels.text)
    l = f.readline()
    while l:
        j = l.split(" ")
        if query_ids_ints[int(j[0]) - 1] in qrels.keys():
            qrels[query_ids_ints[int(j[0]) - 1]].append(int(j[1]))
        else:
            qrels[query_ids_ints[int(j[0]) - 1]] = [int(j[1])]
        l = f.readline()
    cranqryobj = cranqry.loadCranQry(query_text)
    dict_query = {}
    for q in cranqryobj:
        # match the queries in query.text with the entries in qrels.text
        dict_query[int(q)] = cranqryobj[q].text
    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)
    vector_ndcg_score = {}
    vector_score_dict = {}
    for q in selected_queries:
        print(q)
        query_raw = dict_query[q]
        QPobj = QueryProcessor(query_raw, items, index_file)
        QPobj.preprocessing()
        result_list = QPobj.vectorQuery(
            10)  # fetch the first 10 ranked documents using the vector model
        boolean_result_list = QPobj.booleanQuery(
        )  # fetch documents using the boolean model
        print("Boolean query result : ", boolean_result_list)
        truth_list = qrels_dict[q]
        boolean_output_list = []
        rank_doc_list = list(map(lambda x: int(x[0]), result_list))
        print("Relevant documents for this query : ", truth_list)
        print("Vector model result : ", rank_doc_list)
        vector_score_list = []
        for doc_id in boolean_result_list:  # predicted scores for the boolean model
            if int(doc_id) in truth_list:
                boolean_output_list.append(1)
            else:
                boolean_output_list.append(0)
        # pad or truncate the boolean scores to exactly 10 entries
        boolean_score_list = boolean_output_list[:10]
        while len(boolean_score_list) != 10:
            boolean_score_list.append(0)
        for doc_id in rank_doc_list:  # predicted scores for the vector model
            if doc_id in truth_list:
                vector_score_list.append(1)
            else:
                vector_score_list.append(0)
        vector_score_dict[q] = vector_score_list

        # ground-truth lists: the same labels sorted into the ideal order
        truth_score_list = sorted(vector_score_list, reverse=True)
        boolean_truth_score_list = sorted(boolean_score_list, reverse=True)

        print("Vector model ground_truth list is:\n", truth_score_list)
        print("Vector ranking score list is:\n", vector_score_list)
        print("Boolean model ground_truth list is:\n",
              boolean_truth_score_list)
        print("Boolean model score list is:\n", boolean_score_list)
        vector_ndcg_score[q] = [
            ndcg_score(np.array(boolean_truth_score_list),
                       np.array(boolean_score_list)),
            ndcg_score(np.array(truth_score_list), np.array(vector_score_list))
        ]
    # collect the ndcg scores of both models over all the randomly
    # generated queries
    vector_list = []
    boolean_list = []
    for qu in vector_ndcg_score:
        vector_list.append(vector_ndcg_score[qu][1])
        boolean_list.append(vector_ndcg_score[qu][0])

    print("ndcg score of boolean and vector models for all the queries:\n",
          vector_ndcg_score)
    print("ndcg scores list for boolean model for all the queries:\n",
          boolean_list)
    print("ndcg scores list for vector model for all the queries:\n",
          vector_list)
    # p values comparing the boolean and vector models, using the wilcoxon
    # signed-rank test and Welch's t-test
    p_value_wilcoxon = stats.wilcoxon(np.array(boolean_list),
                                      np.array(vector_list))
    p_value_ttest = stats.ttest_ind(np.array(boolean_list),
                                    np.array(vector_list),
                                    equal_var=False)
    print("wilcoxon test p value is:", p_value_wilcoxon[1])
    print("ttest p value is :", p_value_ttest[1])