Example #1
def load_dataset_by_split(split: str) -> Dict[str, List[Dict]]:
    all_data: Dict[str, List[Dict]] = l_to_map(ukp.load, all_topics)

    # split train / dev
    def is_train(entry: Dict) -> bool:
        return entry['set'] == 'train'

    def is_validation(entry: Dict) -> bool:
        return entry['set'] == 'val'

    def filter_train(data: List[Dict]) -> List[Dict]:
        return lfilter(is_train, data)

    def filter_validation(data: List[Dict]) -> List[Dict]:
        return lfilter(is_validation, data)

    if split == "train":
        train_data: Dict[str,
                         List[Dict]] = dict_value_map(filter_train, all_data)
        return train_data
    elif split == "dev":
        val_data: Dict[str,
                       List[Dict]] = dict_value_map(filter_validation,
                                                    all_data)
        return val_data
    else:
        assert False
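All of these snippets lean on a handful of functional helpers from the Chair codebase (dict_value_map, dict_key_map, lmap, lfilter, l_to_map). Their definitions are not shown on this page, so the sketch below is only an assumption of their behavior, inferred from the call sites in the examples; the actual implementations in clover3/Chair may differ.

from typing import Callable, Dict, Iterable, List, TypeVar

K = TypeVar("K")
A = TypeVar("A")
B = TypeVar("B")

# Assumed behavior, inferred from how these helpers are called in the
# examples; the real implementations may differ.
def dict_value_map(fn: Callable[[A], B], d: Dict[K, A]) -> Dict[K, B]:
    # Apply fn to every value while keeping the keys unchanged.
    return {k: fn(v) for k, v in d.items()}

def dict_key_map(fn: Callable[[K], B], d: Dict[K, A]) -> Dict[B, A]:
    # Apply fn to every key while keeping the values unchanged.
    return {fn(k): v for k, v in d.items()}

def lmap(fn: Callable[[A], B], items: Iterable[A]) -> List[B]:
    return list(map(fn, items))

def lfilter(pred: Callable[[A], bool], items: Iterable[A]) -> List[A]:
    return list(filter(pred, items))

def l_to_map(fn: Callable[[K], B], keys: Iterable[K]) -> Dict[K, B]:
    # Build a dict keyed by the inputs, e.g. topic -> ukp.load(topic) in Example #1.
    return {k: fn(k) for k in keys}

Under these assumptions, Example #1 reads as: load every topic into a dict, then produce the per-topic train or validation subset by mapping a filter function over the values.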
Example #2
def predict_by_bm25_rm(bm25_module: BM25, rm_info: Dict[str, List[Tuple[str,
                                                                        str]]],
                       claims, top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    rm_info: Dict[str,
                  List[Tuple[str,
                             float]]] = dict_value_map(parse_float, rm_info)
    rm_info: Dict[str,
                  List[Tuple[str,
                             float]]] = dict_value_map(normalize_scores,
                                                       rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)

        nclaim_id = int(claim_id)
        if nclaim_id in rm_info:
            ex_qtf = rm_info_c[nclaim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
    print(not_found)
    return r
Example #3
File: qrel.py Project: clover3/Chair
def load_clef_qrels() -> Dict[str, List[str]]:
    path1 = os.path.join(data_path, "CLEFeHealth2017IRtask", "assessments", "2017", "clef2017_qrels.txt")
    q_rel_d1 = load_qrels_flat(path1)
    path2 = os.path.join(data_path, "CLEFeHealth2017IRtask", "assessments", "2016", "task1.qrels")
    q_rel_d2 = load_qrels_flat(path2)

    def fn(pair_list):
        return list([doc_id for doc_id, score in pair_list if score > 0])
    q_rel_1 = dict_value_map(fn, q_rel_d1)
    q_rel_2 = dict_value_map(fn, q_rel_d2)

    for key in q_rel_2:
        q_rel_1[key].extend(q_rel_2[key])

    return q_rel_1
Example #4
def group_by_qid_cid(
        predictions: List[Dict]) -> Dict[str, Dict[str, List[Dict]]]:
    grouped: Dict[str, List[Dict]] = group_by(predictions,
                                              lambda x: x['query'].query_id)
    grouped2: Dict[str, Dict[str, List[Dict]]] = \
        dict_value_map(lambda x: group_by(x, lambda x: x['candidate'].id), grouped)
    return grouped2
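Example #4 nests the same idea twice: group_by buckets a flat list by query id, and dict_value_map then applies a second group_by (by candidate id) inside each bucket. A minimal, self-contained sketch of that pattern, reusing the dict_value_map sketch above, with plain dicts as hypothetical stand-ins for the query/candidate objects and an assumed group_by signature:

from collections import defaultdict
from typing import Callable, Dict, List, TypeVar

T = TypeVar("T")
K = TypeVar("K")

# Assumed signature for group_by, matching how it is called in Example #4.
def group_by(items: List[T], key_fn: Callable[[T], K]) -> Dict[K, List[T]]:
    out: Dict[K, List[T]] = defaultdict(list)
    for item in items:
        out[key_fn(item)].append(item)
    return dict(out)

# Hypothetical toy predictions, for illustration only.
predictions = [
    {"qid": "q1", "cid": "c1", "score": 0.8},
    {"qid": "q1", "cid": "c2", "score": 0.1},
    {"qid": "q1", "cid": "c1", "score": 0.7},
]
grouped = group_by(predictions, lambda x: x["qid"])
grouped2 = dict_value_map(lambda l: group_by(l, lambda x: x["cid"]), grouped)
# grouped2["q1"]["c1"] holds the two entries whose cid is "c1".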
Example #5
File: pc_predict.py Project: clover3/Chair
def pc_predict_from_vector_query(bm25_module: BM25,
                                 q_tf_replace: Dict[int, Counter], claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            claim_qtf = Counter(
                dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
Example #6
def save_concat_dev():
    #    prediction_path = pjoin(output_path, "pc_long_seq11")
    prediction_path = pjoin(output_path, "pc_long_focus_1")
    scores: Dict[CPID, List[float]] = collect_pipeline2_score(
        prediction_path, "pc_rel_dev_info_all")
    reduced_score: Dict[CPID, float] = dict_value_map(sum, scores)
    save_to_pickle(reduced_score, "pc_concat_dev_score")
Example #7
def save_ranked_list(prediction_path, meta_info, save_path):

    data = EstimatorPredictionViewer(prediction_path)

    q_dict = {}
    for entry in data:
        data_id = entry.get_vector('data_id')[0]
        scores = entry.get_vector('logits')
        q_id, doc_id = meta_info[data_id]

        if q_id not in q_dict:
            q_dict[q_id] = []

        probs = softmax(scores)
        q_dict[q_id].append((doc_id, probs[1]))

    def add_rank(
            ranked_list: List[Tuple[str,
                                    float]]) -> List[Tuple[str, int, float]]:
        ranked_list.sort(key=lambda x: x[1], reverse=True)
        ranked_list = [(doc_id, rank, score)
                       for rank, (doc_id, score) in enumerate(ranked_list)]
        return ranked_list

    q_dict_new = dict_value_map(add_rank, q_dict)
    write_ranked_list_from_d(q_dict_new, save_path)
Example #8
def main():
    file_path = sys.argv[1]
    top_n = int(sys.argv[2])
    save_path = sys.argv[3]
    ranked_list_d: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(file_path)

    def get_head(l: List):
        return l[:top_n]

    new_ranked_list = dict_value_map(get_head, ranked_list_d)
    write_ranked_list_from_s(new_ranked_list, save_path)
Example #9
 def __init__(self,
              candidates_dict: Dict[str, List[QCKCandidateI]],
              is_correct_fn,
              rel_ranked_list: Dict[str, List[TrecRankedListEntry]],
              kdp_as_sub_token=False):
     self.max_seq_length = 512
     self.tokenizer = get_tokenizer()
     self.candidates_dict: Dict[str, List[QCKCandidateI]] = candidates_dict
     self._is_correct = is_correct_fn
     self.kdp_as_sub_token = kdp_as_sub_token
     self.kdp_score_d: Dict[str, Dict[str, float]] = dict_value_map(
         get_d_from_ranked_list, rel_ranked_list)
Example #10
def load_all_data(
) -> Tuple[Dict[str, List[UkpDataPoint]], Dict[str, List[UkpDataPoint]]]:
    all_data: Dict[str, List[Dict]] = l_to_map(ukp.load, all_topics)

    # split train / dev
    def is_train(entry: Dict) -> bool:
        return entry['set'] == 'train'

    def is_validation(entry: Dict) -> bool:
        return entry['set'] == 'val'

    def filter_train(data: List[Dict]) -> List[Dict]:
        return lfilter(is_train, data)

    def filter_validation(data: List[Dict]) -> List[Dict]:
        return lfilter(is_validation, data)

    raw_train_data: Dict[str,
                         List[Dict]] = dict_value_map(filter_train, all_data)
    raw_val_data: Dict[str,
                       List[Dict]] = dict_value_map(filter_validation,
                                                    all_data)

    def all_data_iterator() -> Iterator[Dict]:
        for data_list in chain(raw_train_data.values(), raw_val_data.values()):
            for dp in data_list:
                yield dp

    dp_id = 1
    for dp in all_data_iterator():
        dp['dp_id'] = dp_id
        dp_id += 1

    def to_data_point(l: List[Dict]) -> List[UkpDataPoint]:
        return lmap(UkpDataPoint.from_dict, l)

    train_data = dict_value_map(to_data_point, raw_train_data)
    val_data = dict_value_map(to_data_point, raw_val_data)

    return train_data, val_data
Example #11
    def average_scores(out_entries: List[QKTokenLevelOutEntry]) -> Dict[WordAsID, np.array]:
        items: List[Iterable[Tuple[WordAsID, TokenScore]]] = lmap(collect_by_word_fn, out_entries)
        d: Dict[WordAsID, List] = defaultdict(list)
        for item in items:
            item: Iterable[Tuple[WordAsID, TokenScore]] = item
            for word, probs in item:
                d[word].append(probs)

        def average_per_dim(probs_list) -> np.array:
            return np.mean(np.array(probs_list), axis=0)

        out_d: Dict[WordAsID, np.array] = dict_value_map(average_per_dim, d)
        return out_d
Example #12
def get_extended_eval_candidate_as_qck_raw(
        split) -> Dict[str, List[QCKCandidate]]:
    c: Dict[int, List[int]] = get_extended_eval_candidate(split)

    def convert_candidates(candidates: List[int]) -> List[QCKCandidate]:
        p_texts = lmap(perspective_getter, candidates)
        l: List[QCKCandidate] = []
        for pid, text in zip(candidates, p_texts):
            l.append(QCKCandidate(str(pid), text))
        return l

    c2: Dict[int, List[QCKCandidate]] = dict_value_map(convert_candidates, c)
    return dict_key_map(str, c2)
Example #13
def load_candidate_all_passage(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    candidate_docs: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def get_doc_id(l: List[SimpleRankedListEntry]):
        return list([e.doc_id for e in l])

    candidate_doc_ids: Dict[str, List[str]] = dict_value_map(
        get_doc_id, candidate_docs)
    token_data: Dict[str, List[str]] = load_robust_tokens_for_predict()
    return load_candidate_all_passage_inner(candidate_doc_ids, token_data,
                                            max_seq_length,
                                            max_passage_per_doc)
Example #14
def get_confidence_list_per_cid(info_dir, prediction_file) -> Dict[int, List[float]]:
    info = load_combine_info_jsons(info_dir)

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    scores: Dict[DataID, Tuple[CPIDPair, float, float]] = collect_scores_and_confidence(prediction_file, info, logit_to_score_softmax)
    grouped = group_by(scores.values(), lambda x: x[0])
    print("Group size:", len(grouped))
    entries = group_by_cpid(grouped)

    cid_grouped = group_by(entries, lambda x: x[0])
    verify_confidence_consistency(cid_grouped)

    return dict_value_map(lambda x: x[0][2], cid_grouped)
Example #15
def predict_by_reweighter(bm25_module: BM25, claims, top_k,
                          param) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    nlp = spacy.load("en_core_web_sm")

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        qtf = Counter(stem_tokenize(c_text))
        weight = claim_term_weight[int(claim_id)]

        new_qtf = Counter()
        for k, v in qtf.items():
            try:
                w = weight[k]
                new_qtf[k] = w * v
            except Exception as e:
                print("Exception")
                print(e)
                print(k)

        tf = Counter(stem_tokenize(p_text))
        score = bm25_module.score_inner(new_qtf, tf)
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
Example #16
File: pc_predict.py Project: clover3/Chair
 def scorer(lucene_score, query_id) -> NamedNumber:
     nonlocal found_claim
     claim_id, p_id = query_id.split("_")
     i_claim_id = int(claim_id)
     if i_claim_id in q_tf_replace_norm:
         claim_qtf = Counter(
             dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
         ex_qtf = q_tf_replace_norm[i_claim_id]
         ex_qtf = Counter(dict(ex_qtf.most_common(50)))
         qtf = ex_qtf + claim_qtf
         found_claim.add(i_claim_id)
     else:
         qtf = c_qtf_d[i_claim_id]
     p_text = perspective_getter(int(p_id))
     p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
     score = bm25_module.score_inner(qtf, Counter(p_tokens))
     return score
Example #17
def eval(
    score_pred_file_name: FileName,
    cpid_resolute_file: FileName,
    n_way=3,
):
    topic = "abortion"
    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    dpid_resolute: Dict[str, DPID] = load_dpid_resolute(cpid_resolute_file)
    score_d: Dict[DPID,
                  np.ndarray] = get_datapoint_score(pred_path, dpid_resolute,
                                                    "avg")

    def argmax(arr: np.ndarray) -> int:
        return arr.argmax()

    pred_d: Dict[DPID, int] = dict_value_map(argmax, score_d)

    dev_labels = get_dev_labels(topic)
    if n_way == 2:

        def merge_label(e):
            dpid, label = e
            return dpid, {
                0: 0,
                1: 1,
                2: 1,
            }[label]

        dev_labels = lmap(merge_label, dev_labels)

    def fetch_pred(e: Tuple[DPID, int]):
        dpid, label = e
        pred = pred_d[dpid]
        return pred

    gold_list: List[int] = right(dev_labels)
    pred_list: List[int] = lmap(fetch_pred, dev_labels)
    if n_way == 3:
        all_result = eval_3label(gold_list, pred_list)
    elif n_way == 2:
        all_result = eval_2label(gold_list, pred_list)
    else:
        assert False
    print(all_result)
    f1 = sum([result['f1'] for result in all_result]) / n_way
    print("Avg F1 : ", f1)
Example #18
def make_qcknc_problem(
    passage_score_path: FilePath,
    info_path: FilePath,
    config_path: FilePath,
    split: str,
    save_name: str,
) -> None:
    candidate_dict: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle(split))
    queries: List[QCKQuery] = get_qck_queries(split)

    config = json.load(open(config_path, "r"))

    def get_pids(l: List[Dict]) -> List[str]:
        return lmap(lambda x: x['pid'], l)

    candidate_id_dict_1: Dict[int, List[str]] = dict_value_map(
        get_pids, candidate_dict)
    candidate_id_dict: Dict[str,
                            List[str]] = dict_key_map(str, candidate_id_dict_1)

    all_candidate_ids = set(flatten(candidate_id_dict.values()))
    candidate_dict: Dict[str, QCKCandidate] = {
        cid: get_qck_candidate_from_candidate_id(cid)
        for cid in all_candidate_ids
    }

    data_id_to_info: Dict = load_combine_info_jsons(info_path, qk_convert_map)
    print("number of dat info ", len(data_id_to_info))
    qk_result: List[Tuple[str, List[QKOutEntry]]] = collect_good_passages(
        data_id_to_info, passage_score_path, config)

    query_dict = {q.query_id: q for q in queries}
    payloads = qck_from_qk_results(qk_result, candidate_id_dict, query_dict,
                                   candidate_dict)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    data_id_man = write_qck_as_tfrecord(save_path, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
Example #19
    def __init__(
        self,
        queries: List[QCKQuery],
        candidates_dict: Dict[str, List[QCKCandidate]],
        is_correct_fn,
    ):
        self.max_seq_length = 512
        self.tokenizer = get_tokenizer()

        def c_list_convert(l: List[QCKCandidate]):
            return lmap(self.get_qck_candidate_w_token, l)

        self.candidates_dict: Dict[str, List[QCKCandidateWToken]] = \
            dict_value_map(c_list_convert, candidates_dict)
        self._is_correct = is_correct_fn
        self.queries: List[QCKQueryWToken] = lmap(self.get_qck_query_w_token,
                                                  queries)
        print("{} insts will made for each kdp".format(
            self.num_insts_per_kdp()))
Example #20
def show_tp(pred_file_path: str, info_file_path: str, input_type: str,
            score_type: str, qrel_path: str):
    judgments_raw: Dict[str, List[Tuple[str,
                                        int]]] = load_qrels_flat(qrel_path)
    judgments = dict_value_map(dict, judgments_raw)
    key_logit = "logits"

    def get_score(entry):
        return get_score_from_logit(score_type, entry[key_logit])

    def get_label(query_id, candidate_id):
        judge_dict = judgments[query_id]
        if candidate_id in judge_dict:
            return judge_dict[candidate_id]
        else:
            return 0

    rows = []
    grouped = load_cache("ck_based_analysis")
    for pair_id, items in grouped.items():
        query_id, kdp_id = pair_id
        if query_id not in judgments:
            continue

        e_list: List[Tuple[str, float]] = []
        n_rel = 0
        for item in items:
            score = get_score(item)
            doc_part_id = item['candidate'].id
            doc_id = get_doc_id(doc_part_id)
            e = (doc_id, score)
            e_list.append(e)

            label = bool(get_label(query_id, doc_id))
            if label:
                if score > 0.1:
                    row = [query_id, kdp_id, doc_part_id, score]
                    rows.append(row)
                n_rel += 1
        row = [len(items), n_rel]
        rows.append(row)

    print_table(rows)
Example #21
def load_candidate_head_as_doc(
        doc_len=400) -> Dict[str, List[QCKCandidateWToken]]:
    top_k = 100
    candidate_docs: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()
    print("Num queries : ", len(candidate_docs))
    print("Loading robust collection tokens...", end="")
    data: Dict[str, List[str]] = load_robust_tokens_for_predict()
    print("Done")
    print("Total of {} docs".format(len(data)))

    def make_candidate(doc_id: str):
        tokens = data[doc_id]
        return QCKCandidateWToken(doc_id, "", tokens[:doc_len])

    def fetch_docs(
            ranked_list: List[SimpleRankedListEntry]
    ) -> List[QCKCandidateWToken]:
        return list([make_candidate(e.doc_id) for e in ranked_list[:top_k]])

    return dict_value_map(fetch_docs, candidate_docs)
Example #22
def load_candidate_d():
    candidate_docs: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def get_doc_id(l: List[SimpleRankedListEntry]):
        return list([e.doc_id for e in l])

    candidate_doc_ids: Dict[str, List[str]] = dict_value_map(
        get_doc_id, candidate_docs)
    # token_data: Dict[str, List[str]] = load_robust_tokens_for_predict()
    docs = load_from_pickle("robust04_docs_predict")

    out_d = {}
    top_k = 100
    for query_id, doc_id_list in candidate_doc_ids.items():
        new_entries = []
        for doc_id in doc_id_list[:top_k]:
            # tokens = token_data[doc_id]
            content = docs[doc_id]
            new_entries.append((doc_id, content))

        out_d[query_id] = new_entries
    return out_d
Example #23
File: qck_common.py Project: clover3/Chair
def get_qck_candidate_from_ranked_list(
        ranked_list) -> Dict[str, List[QCKCandidate]]:
    d: Dict[str, List[str]] = get_candidate_ids_from_ranked_list(ranked_list)
    return dict_value_map(add_texts, d)
Example #24
def do_job(input_dir, output_dir, info_dir, label_info_path, max_entries,
           job_id):

    exist_or_mkdir(output_dir)
    info_output_dir = output_dir + "_info"
    exist_or_mkdir(info_output_dir)

    label_info: List[Tuple[str, str,
                           int]] = json.load(open(label_info_path, "r"))
    label_info_d = {(str(a), str(b)): c for a, b, c in label_info}

    pred_path = os.path.join(input_dir, str(job_id) + ".score")
    #info_path = os.path.join(info_dir, str(job_id) + ".info")
    info_path = info_dir
    output_path = os.path.join(output_dir, str(job_id))
    info_output_path = os.path.join(info_output_dir, str(job_id))
    info = load_combine_info_jsons(info_path, qck_convert_map, True)
    fetch_field_list = ["vector", "data_id"]

    predictions = join_prediction_with_info(pred_path, info, fetch_field_list)

    def get_qid(entry):
        return entry['query'].query_id

    def get_candidate_id(entry):
        return entry['candidate'].id

    def pair_id(entry) -> Tuple[str, str]:
        return get_qid(entry), get_candidate_id(entry)

    groups: Dict[Tuple[str, str], List[Dict]] = group_by(predictions, pair_id)

    def get_new_entry(entries: List[Dict]):
        if not entries:
            return None
        vectors: Vectors = list([e['vector'] for e in entries])
        key = pair_id(entries[0])
        if key in label_info_d:
            label: Label = label_info_d[key]
        else:
            label: Label = 0

        return vectors, label

    g2: Dict[Tuple[str, str],
             Tuple[Vectors, Label]] = dict_value_map(get_new_entry, groups)
    base = 100 * 1000 * job_id
    max_count = 100 * 1000 * (job_id + 1)
    data_id_manager = DataIDManager(base, max_count)

    def get_out_itr() -> Iterable[Tuple[int, Tuple[Vectors, Label]]]:
        for key, data in g2.items():
            qid, cid = key
            data_info = {
                'qid': qid,
                'cid': cid,
            }
            data_id = data_id_manager.assign(data_info)
            yield data_id, data

    write_to_file(output_path, get_out_itr(), max_entries)
    json.dump(data_id_manager.id_to_info, open(info_output_path, "w"))
Example #25
File: pc_predict.py Project: clover3/Chair
def pc_predict_vector_query_and_reweight(
        bm25_module: BM25, q_tf_replace: Dict[int, Counter], claims, top_k,
        param) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    found_claim = set()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)

    def do_stem(t: str) -> str:
        r = bm25_module.tokenizer.stemmer.stem(t)
        return r

    def apply_stem(term_weight: Dict[str, float]) -> Dict[str, float]:
        return {do_stem(k): v for k, v in term_weight.items()}

    claim_term_weight: Dict[int, Dict[str, float]] = get_claim_term_weighting(
        claims, param)
    claim_term_weight: Dict[int, Dict[str, float]] = dict_value_map(
        apply_stem, claim_term_weight)

    nlp = spacy.load("en_core_web_sm")

    def stem_tokenize(text: str) -> Iterator[str]:
        for t in nlp(text):
            try:
                yield do_stem(t.text)
            except UnicodeDecodeError:
                pass

    def get_qtf(claim_id):
        weight = claim_term_weight[claim_id]
        new_qtf = Counter()
        c_text = cid_to_text[int(claim_id)]
        qtf = Counter(stem_tokenize(c_text))
        print(weight)
        for k, v in qtf.items():
            try:
                if k in weight:
                    w = weight[k]
                    new_qtf[k] = w * v
                else:
                    new_qtf[k] = v
            except Exception as e:
                print("Exception")
                print(e)
                print(k)
        return new_qtf

    c_qtf_d = {k: get_qtf(k) for k in cid_to_text.keys()}

    # for cid, c_text in cid_to_text.items():
    #     c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
    #     c_qtf_d[cid] = Counter(c_tokens)

    def scorer(lucene_score, query_id) -> NamedNumber:
        nonlocal found_claim
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        if i_claim_id in q_tf_replace_norm:
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + c_qtf_d[i_claim_id]
            found_claim.add(i_claim_id)
        else:
            qtf = c_qtf_d[i_claim_id]
        p_text = perspective_getter(int(p_id))
        p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
        score = bm25_module.score_inner(qtf, Counter(p_tokens))
        return score

    r = predict_interface(claims, top_k, scorer)
    print("{} of {} found".format(len(found_claim), len(claims)))
    return r
Example #26
def prec_recall(pred_file_path: str, info_file_path: str, input_type: str,
                score_type: str, qrel_path: str):
    judgments_raw: Dict[str, List[Tuple[str,
                                        int]]] = load_qrels_flat(qrel_path)
    judgments = dict_value_map(dict, judgments_raw)

    grouped = load_cache("ck_based_analysis")
    key_logit = "logits"

    if grouped is None:
        f_handler = get_format_handler(input_type)
        info: Dict = load_combine_info_jsons(info_file_path,
                                             f_handler.get_mapping(),
                                             f_handler.drop_kdp())
        data: List[Dict] = join_prediction_with_info(pred_file_path, info,
                                                     ["data_id", key_logit])
        grouped = group_by(data, get_qk_pair_id)

    def get_score(entry):
        return get_score_from_logit(score_type, entry[key_logit])

    def get_label(query_id, candidate_id):
        judge_dict = judgments[query_id]
        if candidate_id in judge_dict:
            return judge_dict[candidate_id]
        else:
            return 0

    head = [
        "query_id", "kdp_id", "accuracy", "precision", "recall", "f1", "tp",
        "fp", "tn", "fn"
    ]
    rows = [head]
    for pair_id, items in grouped.items():
        query_id, kdp_id = pair_id
        if query_id not in judgments:
            continue

        e_list: List[Tuple[str, float]] = []

        labels = []
        predictions = []
        for item in items:
            score = get_score(item)
            doc_part_id = item['candidate'].id
            doc_id = get_doc_id(doc_part_id)
            e = (doc_id, score)
            e_list.append(e)
            label = bool(get_label(query_id, doc_id))
            labels.append(label)
            prediction = score > 0.5
            predictions.append(prediction)

        scores = get_acc_prec_recall(predictions, labels)

        row = [
            query_id, kdp_id, scores['accuracy'], scores['precision'],
            scores['recall'], scores['f1'], scores['tp'], scores['fp'],
            scores['tn'], scores['fn']
        ]
        rows.append(row)
    print_table(rows)
Example #27
File: kdp_para.py Project: clover3/Chair
def first_pid_as_rep() -> Dict[int, List[int]]:
    id_dict: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    id_dict_small: Dict[int, List[int]] = dict_value_map(
        lambda ll: lmap(lambda l: l[0], ll), id_dict)
    return id_dict_small
Example #28
def get_docs_from_q_res_path_top_k(file_path, top_k) -> Dict[str, List[List[str]]]:
    ranked_list_d: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(file_path)
    ranked_list_d = dict_value_map(lambda x: x[:top_k], ranked_list_d)
    return get_docs_from_q_res(ranked_list_d)
Example #29
def get_docs_from_q_res(ranked_list_d: Dict[str, List[SimpleRankedListEntry]]) -> Dict[str, List[List[str]]]:
    print(len(ranked_list_d))
    return dict_value_map(get_docs_from_ranked_list, ranked_list_d)
Example #30
def pc_predict_to_inspect(bm25_module: BM25, q_tf_replace: Dict[int, Counter],
                          q_tf_replace_0: Dict[int, Counter], claims, top_k):
    gold = get_claim_perspective_id_dict()
    q_tf_replace_norm = dict_value_map(normalize_counter, q_tf_replace)
    q_tf_replace_0_norm = dict_value_map(normalize_counter, q_tf_replace_0)

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    c_qtf_d = {}
    for cid, c_text in cid_to_text.items():
        c_tokens = bm25_module.tokenizer.tokenize_stem(c_text)
        c_qtf_d[cid] = Counter(c_tokens)

    def counter_to_str(c: Dict) -> str:
        s = ""
        for k, v in c.items():
            s += "{0} {1:.2f}".format(k, v) + "\t"
        return s

    for claim in claims:
        cid = claim['cId']
        i_claim_id = int(cid)
        claim_text = claim['text']
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        candidate_pids = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_pids.append(_pid)

        if i_claim_id in q_tf_replace_norm:
            claim_qtf = Counter(
                dict_value_map(lambda x: x * 1, c_qtf_d[i_claim_id]))
            ex_qtf = q_tf_replace_norm[i_claim_id]
            ex_qtf = Counter(dict(ex_qtf.most_common(50)))
            qtf = ex_qtf + claim_qtf
        else:
            qtf = c_qtf_d[i_claim_id]

        ranked_list = []
        for pid in candidate_pids:
            p_text = perspective_getter(int(pid))
            p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
            score = bm25_module.score_inner(qtf, Counter(p_tokens))
            debug_str = ""

            e = score, pid, p_text, debug_str
            ranked_list.append(e)

        gold_pids = gold[cid]

        def is_correct(pid):
            for pids in gold_pids:
                if pid in pids:
                    return True
            return False

        ranked_list.sort(key=lambda x: x[0], reverse=True)

        qtf_idf_applied = {
            k: v * bm25_module.term_idf_factor(k)
            for k, v in qtf.items()
        }
        print()
        print("Claim: ", cid, claim_text)
        for cluster in gold_pids:
            print("-")
            for pid in cluster:
                print(pid, perspective_getter(pid))
        print()
        print("qtf:", counter_to_str(qtf))
        if i_claim_id in q_tf_replace_norm and i_claim_id in q_tf_replace_0_norm:
            print("ex_qtf:", counter_to_str(ex_qtf))
            ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
            ex_qtf_0 = Counter(dict(ex_qtf_0.most_common(50)))
            print("ex_qtf_0:", counter_to_str(ex_qtf_0))
        print("qtf idf apllied:", counter_to_str(qtf_idf_applied))

        for score, pid, p_text, debug_str in ranked_list[:top_k]:

            if i_claim_id in q_tf_replace_0_norm:
                p_text = perspective_getter(int(pid))
                p_tokens = bm25_module.tokenizer.tokenize_stem(p_text)
                ex_qtf_0 = q_tf_replace_0_norm[i_claim_id]
                qtf = ex_qtf_0 + c_qtf_d[i_claim_id]
                score2 = bm25_module.score_inner(qtf, Counter(p_tokens))
                correct_str = "Y" if is_correct(pid) else "N"
                print("{0} {1:.2f} ({2:.2f}) {3} / {4} / {5}".format(
                    correct_str, score, score2, p_text, score.name,
                    score2.name))