Пример #1
0
def main():
    """Print a per-query error analysis of a ranked evidence list.

    The ranked list is loaded from the path given in ``sys.argv[1]``. Each
    query id is split on ``"_"`` into a claim id and a perspective id. For
    every query whose gold evidence shows up in its ranked list, precision is
    computed over the top ``num_rel`` entries, where ``num_rel`` is the number
    of gold entries found in that list. Queries with precision above 0.99 are
    reported as "Good"; for the rest, the claim, the perspective, the
    retrieved gold entries and the non-gold entries with ``rank < 3`` are
    printed.
    """
    claim_text_by_int_id: Dict[int, str] = get_all_claim_d()
    # Re-keyed with str because the claim id obtained from a query id is a string.
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_by_int_id)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()

        claim_id, perspective_id = query.split("_")
        # A set gives constant-time membership tests in the per-entry loops below;
        # ids are stringified because entry.doc_id is compared against them.
        gold_ids = set(map(str, evi_gold_dict[query]))
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            if entry.doc_id in gold_ids:
                pos_entries.append(entry)
            elif entry.rank < 3:
                # Only the highest-ranked non-gold entries are kept for display.
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        num_rel = len(pos_entries)

        # 1 for every gold entry among the first num_rel ranked entries, else 0.
        correctness = [int(entry.doc_id in gold_ids)
                       for entry in ranked_list[:num_rel]]

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))

        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
def main():
    """Interactively print the claim, perspective and gold evidence for a key.

    Reads lines of the form ``<claim id>_<perspective id>`` from standard
    input until end of input. For each well-formed line the claim text, the
    perspective text and every gold evidence text stored under the
    ``(claim id, perspective id)`` key are printed. A malformed line is
    reported and skipped rather than ending the session.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()
    evidence_d = load_evidence_dict()
    evidence_gold = evidence_gold_dict()
    while True:
        try:
            s = input()
        except EOFError:
            # stdin was closed (Ctrl-D or end of piped input): finish quietly.
            break
        try:
            cid_text, pid_text = s.split("_")
            cid = int(cid_text)
            pid = int(pid_text)
        except ValueError:
            # Wrong number of "_"-separated parts, or a part that is not an integer.
            print("Expected <claim id>_<perspective id>, got: {!r}".format(s))
            continue
        print("Claim: ", claim_text_d[cid])
        print("Perspective: ", perspective_getter(pid))
        key = cid, pid
        e_ids = evidence_gold[key]
        for eid in e_ids:
            print("Evidence: ", evidence_d[eid])
Пример #3
0
def get_query_lms(split) -> Dict[str, Counter]:
    """Build one language model per query from that query's gold evidence.

    For every query returned by ``get_qck_queries(split)``, the texts of its
    gold evidence ids are collected and passed, together with a
    ``PCTokenizer``, to ``text_list_to_lm``. The result maps each query id to
    the value ``text_list_to_lm`` returned for it.
    """
    evidence_text_by_id: Dict[int, str] = load_evidence_dict()
    tokenizer = PCTokenizer()
    queries = get_qck_queries(split)
    gold_ids_by_qid: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def build_lm(query: QCKQuery) -> Counter:
        # Gold evidence texts for this query, in the order the gold ids are listed.
        gold_texts: List[str] = [
            evidence_text_by_id[evidence_id]
            for evidence_id in gold_ids_by_qid[query.query_id]
        ]
        return text_list_to_lm(tokenizer, gold_texts)

    language_models = lmap(build_lm, queries)
    query_ids = lmap(QCKQuery.get_id, queries)
    return dict(zip(query_ids, language_models))
Пример #4
0
def get_ex_candidate_for_training(split,
                                  balanced=True,
                                  cached=False
                                  ) -> Dict[str, List[QCKCandidateI]]:
    """Build per-query training candidates from gold and non-gold evidence.

    For each query of ``split`` the gold evidence ids are combined with
    non-gold ids taken from the ranked candidates of that query. Each selected
    evidence text is tokenized and turned into one ``QCKCandidateWToken`` per
    passage yielded by ``enum_passage``.

    Args:
        split: Passed through to the candidate and query loaders.
        balanced: When true, at most as many non-gold ids as there are gold
            ids are selected for a query (none when the query has no gold).
        cached: When true, ranked candidates come from
            ``load_top_rank_candidate``; otherwise from ``get_candidate``.

    Returns:
        A dict mapping each query id to its list of passage candidates, gold
        evidence first, followed by the selected non-gold evidence.
    """
    if cached:
        bow_ranked = load_top_rank_candidate(split)
    else:
        bow_ranked = get_candidate(split)
    tokenizer = get_tokenizer()
    evi_dict: Dict[int, str] = load_evidence_dict()
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    queries = get_qck_queries(split)
    max_seq_length = 512
    out_d = {}
    for query in queries:
        qid = query.query_id
        c_list = bow_ranked[qid]
        gold_e_ids: List[int] = evi_gold_dict[qid]
        # Set copy for constant-time membership tests in the loop below.
        gold_e_id_set = set(gold_e_ids)
        top_ranked: List[int] = lmap(int, map(QCKCandidate.get_id, c_list))
        query_len = len(tokenizer.tokenize(query.text))
        # Token budget left for a passage: 3 positions are reserved on top of
        # the query tokens (presumably for special tokens - TODO confirm).
        candidate_max_len = max_seq_length - 3 - query_len
        neg_e_ids = []
        # NOTE(review): iterating a set removes duplicates but also discards
        # the ranking order of top_ranked - confirm that this is intended.
        for e_id in set(top_ranked):
            # Checked before appending so that a query without gold evidence
            # gets no negatives instead of every ranked candidate.
            if balanced and len(neg_e_ids) >= len(gold_e_ids):
                break
            if e_id not in gold_e_id_set:
                neg_e_ids.append(e_id)

        def make_candidate(e_id: int) -> Iterable[QCKCandidate]:
            text = evi_dict[e_id]
            tokens = tokenizer.tokenize(text)
            for passage in enum_passage(tokens, candidate_max_len):
                yield QCKCandidateWToken(str(e_id), "", passage)

        new_list = lflatten(map(make_candidate, gold_e_ids + neg_e_ids))
        out_d[qid] = new_list
    return out_d
Пример #5
0
def insert():
    """Index every evidence text as one document with 'id' and 'text' fields.

    Each entry of ``load_evidence_dict()`` is sent to the module-level ``es``
    client under ``index_name``.
    """
    for evidence_id, evidence_text in load_evidence_dict().items():
        document = {'id': evidence_id, 'text': evidence_text}
        es.index(index=index_name, body=document)