# Requires pandas (imported as pd) and the project's metrics module.
def check_engine_quality(self, query_num, list_of_docs):
        """
        Evaluate retrieval quality for a single query against the labeled
        benchmark and store the per-query scores on the instance.

        :param query_num: number of the query being evaluated.
        :param list_of_docs: ranked list of tweet ids returned for the query.
        :return: None. Prints the query's metrics: precision, precision@5/10/50,
            recall, and MAP.
        """

        # Load the benchmark labels and keep only the rows for this query
        # whose tweets actually appear in the retrieved list.
        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)

        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        # Map tweet id -> relevance label (y_true).
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

        # Collect the label of each retrieved doc in ranked order. Docs that
        # are missing from the benchmark (or have a malformed id) are dropped
        # from the evaluation.
        rmv_lst = []
        ranking = []
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except (KeyError, ValueError):
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)

        # Ranked results paired with their ground-truth labels.
        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })

        # Total number of relevant docs for this query in the benchmark
        # (the recall denominator).
        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])

        # print("total Relevant doc found with tag 1 :" , len (data_df[data_df['y_true'] == 1.0]))
        # print("total NON relevant doc found with tag 0 :" , len (data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")
        # Calculate and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)

        print()
        print(f"precision at 5 of query {query_num} is: {prec5}")
        print(f"precision at 10 of query {query_num} is: {prec10}")
        print(f"precision at 50 of query {query_num} is: {prec50}")
        print(f"precision of query {query_num} is: {prec_total}")
        print(f"recall of query {query_num} is: {recall_val}")
        print(f"map of query {query_num} is: {map_of_query}")
Example #2
 # q_results_labeled (ranked results with y_true labels) and q2n_relevant
 # (query -> total number of relevant docs) are built earlier in this
 # example; that part is not shown here.
 dict_data = {
     'query': [],
     'precision': [],
     'precision@5': [],
     'precision@10': [],
     'precision@50': [],
     'recall': []
 }
 # Evaluate queries 1 through 35, collecting each metric per query.
 for query_num in range(1, 36):
     dict_data['query'].append(query_num)
     dict_data['recall'].append(
         metrics.recall_single(q_results_labeled,
                               q2n_relevant.get(query_num),
                               query_num))
     dict_data['precision'].append(
         metrics.precision(q_results_labeled, True,
                           query_num))
     dict_data['precision@5'].append(
         metrics.precision_at_n(q_results_labeled,
                                query_num, 5))
     dict_data['precision@10'].append(
         metrics.precision_at_n(q_results_labeled,
                                query_num, 10))
     dict_data['precision@50'].append(
         metrics.precision_at_n(q_results_labeled,
                                query_num, 50))
 df_data = pd.DataFrame(dict_data,
                        columns=[
                            'query', 'precision',
                            'precision@5', 'precision@10',
                            'precision@50', 'recall'
                        ])
 # print(df_data)
 df_data.to_excel(engine_module + "_output.xlsx")
 # test that the average across queries of precision,
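
The trailing comment is cut off, but it points toward a sanity check on the per-query averages. A minimal sketch of such a check, assuming the df_data frame built above (the 0-1 bounds assertion is an illustrative assumption, not from the source):

 # Hypothetical sanity check: average each metric across queries 1..35 and
 # verify every mean is a valid proportion.
 metric_cols = ['precision', 'precision@5', 'precision@10', 'precision@50', 'recall']
 averages = df_data[metric_cols].mean()
 print(averages)
 assert ((averages >= 0) & (averages <= 1)).all()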