Example #1
def main():
    claim_text_d_raw: Dict[int, str] = get_all_claim_d()
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d_raw)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()

        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        # Partition the ranked list: gold entries vs. highly ranked non-gold ones
        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        num_rel = len(pos_entries)

        # R-precision: how many of the top num_rel entries are gold
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))

        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
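
The snippets above and below rely on a few project helpers whose definitions are not shown. A minimal sketch of plausible implementations, inferred only from how they are called in these examples (the real versions may differ):

from typing import Callable, Dict, Iterable, List, TypeVar

K = TypeVar("K")
K2 = TypeVar("K2")
V = TypeVar("V")
V2 = TypeVar("V2")
T = TypeVar("T")
R = TypeVar("R")


def dict_key_map(fn: Callable[[K], K2], d: Dict[K, V]) -> Dict[K2, V]:
    # Apply fn to every key, keeping values unchanged.
    return {fn(k): v for k, v in d.items()}


def dict_value_map(fn: Callable[[V], V2], d: Dict[K, V]) -> Dict[K, V2]:
    # Apply fn to every value, keeping keys unchanged.
    return {k: fn(v) for k, v in d.items()}


def lmap(fn: Callable[[T], R], items: Iterable[T]) -> List[R]:
    # Eager map: list(map(fn, items)).
    return list(map(fn, items))


def foreach(fn: Callable[[T], None], items: Iterable[T]) -> None:
    # Call fn on each item purely for its side effects.
    for item in items:
        fn(item)
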
Example #2
def get_idf_keyword_score(problems: List[QueryDoc],
                          get_idf) -> Iterable[Counter]:
    stemmer = CacheStemmer()
    ticker = TimeEstimator(len(problems))
    for p in problems:
        tokens = p.doc
        tf = Counter()
        reverse_map = {}  # Stemmed -> raw
        tokens = [t for t in tokens if t not in {".", ",", "!"}]  # drop punctuation tokens
        for raw_t in tokens:
            stem_t = stemmer.stem(raw_t)
            reverse_map[stem_t] = raw_t
            tf[stem_t] += 1

        score_d = Counter()
        for term, cnt in tf.items():
            # TF-IDF style score: dampened term frequency times inverse document frequency
            score = math.log(1 + cnt) * get_idf(term)
            assert isinstance(score, float)
            score_d[term] = score

        score_d_surface_form: Counter = Counter(
            dict_key_map(lambda x: reverse_map[x], score_d))
        ticker.tick()
        yield score_d_surface_form
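
The per-term score above is log(1 + tf) * idf(term). A self-contained toy run of the same computation, with a hypothetical idf table standing in for get_idf:

import math
from collections import Counter

# Hypothetical idf values; in the real code get_idf is backed by corpus statistics.
idf_table = {"neural": 3.2, "network": 2.8, "the": 0.1}

tf = Counter(["neural", "network", "network", "the"])
score_d = Counter({t: math.log(1 + c) * idf_table[t] for t, c in tf.items()})
print(score_d.most_common())
# [('network', 3.07...), ('neural', 2.21...), ('the', 0.06...)]
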
Example #3
def run_random_walk_score_with_weight():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(int, q_tf_replace)
    bm25 = get_bm25_module()
    pred = pc_predict_vector_query_and_reweight(bm25, q_tf_replace, claims,
                                                top_k, {'k1': 0.5})
    print(evaluate(pred))
Example #4
def run_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(int, q_tf_replace)
    #q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    #q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))
    bm25 = get_bm25_module()
    pred = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    print(evaluate(pred))
Example #5
def main():
    config = load_run_config()
    with open(config['dvp_path'], "rb") as f:
        dvp_list = pickle.load(f)
    dvp_to_correctness_dict = dvp_to_correctness(dvp_list, config)

    def convert_key(key: Tuple[str, Tuple[str, int]]) -> Tuple[str, str]:
        qid, (doc_id, doc_idx) = key
        return qid, "{}_{}".format(doc_id, doc_idx)

    save_dict = dict_key_map(convert_key, dvp_to_correctness_dict)

    with open(config['save_path'], "wb") as f:
        pickle.dump(save_dict, f)
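
For reference, convert_key flattens the nested tuple key into a pair of plain strings. A standalone check using the dict_key_map sketch from Example #1, on made-up data:

d = {("q1", ("D12", 0)): True, ("q1", ("D12", 1)): False}
flat = dict_key_map(lambda k: (k[0], "{}_{}".format(k[1][0], k[1][1])), d)
print(flat)  # {('q1', 'D12_0'): True, ('q1', 'D12_1'): False}
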
Example #6
def run_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    bm25 = get_bm25_module()
    #df, N = get_idf()
    #bm25.df = df
    #bm25.N = N
    q_tf_replace_0 = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict(load_from_pickle("dev_claim_random_walk_debug2"))
    q_tf_replace = dict_key_map(int, q_tf_replace)
    pc_predict_to_inspect(bm25, q_tf_replace, q_tf_replace_0, claims, top_k)
Example #7
def get_extended_eval_candidate_as_qck_raw(
        split) -> Dict[str, List[QCKCandidate]]:
    c: Dict[int, List[int]] = get_extended_eval_candidate(split)

    def convert_candidates(candidates: List[int]) -> List[QCKCandidate]:
        p_texts = lmap(perspective_getter, candidates)
        l: List[QCKCandidate] = []
        for pid, text in zip(candidates, p_texts):
            l.append(QCKCandidate(str(pid), text))
        return l

    c2: Dict[int, List[QCKCandidate]] = dict_value_map(convert_candidates, c)
    return dict_key_map(str, c2)
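
The two-step conversion above (map the values, then stringify the keys) recurs throughout these examples. A toy run using the helper sketches from Example #1, with made-up IDs in place of real perspective data:

c = {10: [100, 200], 20: [300]}
c2 = dict_value_map(lambda pids: ["text for {}".format(p) for p in pids], c)
print(dict_key_map(str, c2))
# {'10': ['text for 100', 'text for 200'], '20': ['text for 300']}
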

def make_qcknc_problem(
    passage_score_path: FilePath,
    info_path: FilePath,
    config_path: FilePath,
    split: str,
    save_name: str,
) -> None:
    candidate_dict: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle(split))
    queries: List[QCKQuery] = get_qck_queries(split)

    with open(config_path, "r") as f:
        config = json.load(f)

    def get_pids(l: List[Dict]) -> List[str]:
        return lmap(lambda x: x['pid'], l)

    candidate_id_dict_1: Dict[int, List[str]] = dict_value_map(
        get_pids, candidate_dict)
    candidate_id_dict: Dict[str, List[str]] = dict_key_map(
        str, candidate_id_dict_1)

    all_candidate_ids = set(flatten(candidate_id_dict.values()))
    qck_candidate_dict: Dict[str, QCKCandidate] = {
        cid: get_qck_candidate_from_candidate_id(cid)
        for cid in all_candidate_ids
    }

    data_id_to_info: Dict = load_combine_info_jsons(info_path, qk_convert_map)
    print("number of dat info ", len(data_id_to_info))
    qk_result: List[Tuple[str, List[QKOutEntry]]] = collect_good_passages(
        data_id_to_info, passage_score_path, config)

    query_dict = {q.query_id: q for q in queries}
    payloads = qck_from_qk_results(qk_result, candidate_id_dict, query_dict,
                                   qck_candidate_dict)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    data_id_man = write_qck_as_tfrecord(save_path, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    with open(info_save_path, "w") as f:
        json.dump(data_id_man.id_to_info, f)
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)