示例#1
0
def save_to_csv():
    """Write labelled claim/perspective pairs for train, dev and test.

    For every split the candidates returned by ``predict_by_elastic_search``
    (50 requested per claim) are turned into rows of
    ``sentence1, sentence2, gold_label, cid, pid`` and written with the
    ``excel-tab`` csv dialect. ``gold_label`` is 1 when the candidate's pid
    occurs anywhere in the gold perspective ids of its claim, otherwise 0.
    """
    gold = get_claim_perspective_id_dict()
    head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']

    def routine(claims, out_path):
        """Label the candidates of `claims` and write them to `out_path`."""
        payloads = predict_by_elastic_search(claims, 50)
        rows = []
        for cid, data_list in payloads:
            # gold[cid] is nested; flatten it once per claim so the
            # membership test below is a set lookup.
            all_pid_set = set(flatten(gold[cid]))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                pid = p_entry['pid']
                y = 1 if pid in all_pid_set else 0
                rows.append([c_text, p_text, y, cid, pid])

        # The with-block guarantees the file is flushed and closed, and
        # newline="" is what the csv module requires so that its own line
        # terminator is not translated a second time by the text layer.
        with open(out_path, "w", encoding="utf-8", newline="") as out_file:
            writer = csv.writer(out_file, dialect='excel-tab')
            writer.writerow(head)
            writer.writerows(rows)

    # The validation part of the train split is not exported here.
    claims, _ = train_split()
    routine(claims, get_file_path('train'))

    dev_ids: List[int] = list(load_dev_claim_ids())
    routine(get_claims_from_ids(dev_ids), get_file_path('dev'))

    test_ids: List[int] = list(load_test_claim_ids())
    routine(get_claims_from_ids(test_ids), get_file_path('test'))
示例#2
0
def run_bm25_rm():
    """Predict with BM25 + RM info on the dev claims and print the evaluation.

    Uses the pickled "perspective_dev_claim_rm" data and keeps the top 7
    predictions per claim.
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    rm_info = load_from_pickle("perspective_dev_claim_rm")
    bm25_module = get_bm25_module()
    predictions = predict_by_bm25_rm(bm25_module, rm_info, dev_claims, 7)
    print(evaluate(predictions))
示例#3
0
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface], writer,
        name_prefix):
    """Run CPPNC instance-generation jobs for the train and val claims.

    The train claim ids are split with ``split_7_3``; the passages stored in
    the "pc_train_a_passages" pickle are indexed by claim id and handed to
    `generator_functor` to build the generator shared by both job runs.
    Jobs are named ``name_prefix + "_train"`` (378 jobs) and
    ``name_prefix + "_val"`` (162 jobs).
    """
    # claim ids split to train/val
    train_ids: List[int] = list(load_train_claim_ids())
    train, val = split_7_3(get_claims_from_ids(train_ids))

    # The pickle holds a pair; only its first element is needed here.
    entries, _ = load_from_pickle("pc_train_a_passages")
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {}
    for claim, passages in entries:
        cid_to_passages[claim['cId']] = passages
    generator = generator_functor(cid_to_passages)

    def run_split(split_claims, num_jobs, suffix):
        """Create and start a JobRunner over `split_claims`."""
        def worker_factory(out_dir):
            return CPPNCWorker(split_claims, generator, writer, out_dir)

        job_runner = JobRunner(job_man_dir, num_jobs, name_prefix + suffix,
                               worker_factory)
        job_runner.start()

    print("Generate instances : train")
    run_split(train, 378, "_train")

    print("Generate instances : val")
    run_split(val, 162, "_val")
示例#4
0
def get_perspective_candidates(claim_id):
    """Yield ``(text, pid)`` for perspectives retrieved for one claim.

    The claim text is used as the query to
    ``es_helper.get_perspective_from_pool`` with 50 requested results; the
    score of each result is dropped.
    """
    from arg.perspectives import es_helper
    claim = get_claims_from_ids([claim_id])[0]
    hits = es_helper.get_perspective_from_pool(claim['text'], 50)
    yield from ((text, pid) for text, pid, _score in hits)
示例#5
0
def claim_language_model_property():
    """Print per-perspective language-model scores for every dev claim.

    For each claim: fetch its candidate perspectives, load the documents in
    its ranked list, build an LM classifier from those documents and the
    relevant unigrams, then print each perspective text with the sum of its
    per-token odds.
    """
    claims = get_claims_from_ids(load_dev_claim_ids())
    ranked_lists = ClaimRankedList()
    vocabulary = set()
    n_candidates = 50
    for claim in claims:
        claim_text, perspectives = get_perspective(claim, n_candidates)
        print(claim_text)
        unigrams = get_relevant_unigrams(perspectives)
        # The first element of each ranked-list entry is used as the doc id.
        doc_ids = [entry[0] for entry in ranked_lists.get(str(claim['cId']))]
        print("Loading documents")
        preload_tf(doc_ids)
        docs = lmap(load_and_format_doc, doc_ids)

        foreach(lambda d: vocabulary.update(d['tokens_set']), docs)

        # A check_hypothesis(...) call over the collected term statistics
        # used to sit here; it is currently disabled.

        print("counting terms stat")

        lm_classifier = build_lm(docs, unigrams)

        for text, _pid, _score in perspectives:
            token_odds = lmap(lm_classifier.per_token_odd,
                              nltk.word_tokenize(text))
            print(text, sum(token_odds))
示例#6
0
def main():
    """Select paragraphs for the dev claims and pickle the result.

    Reads the "dev_claim_docs" pickle and the ClueWeb12-B13 term statistics,
    runs ``select_paragraph`` in "topk" mode and stores the output as
    "dev_claim_paras".
    """
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    # Only the second element of the term statistics pair is used.
    _unused, clue12_13_df = load_clueweb12_B13_termstat()
    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    selected = select_paragraph(docs, clue12_13_df, dev_claims, "topk")
    save_to_pickle(selected, "dev_claim_paras")
示例#7
0
 def write(claim_ids, split_name):
     """Build queries for the given claims and save them for `split_name`.

     The output file is placed under ``output_path`` and named
     ``perspective_<split_name>_claim_query_k0.json``.
     """
     queries = get_claims_query(get_claims_from_ids(claim_ids), True)
     file_name = "perspective_{}_claim_query_k0.json".format(split_name)
     save_queries_to_file(queries, os.path.join(output_path, file_name))
示例#8
0
def start_generate_jobs_for_train_val(generator: InstanceGenerator,
                                      name_prefix):
    """Run QCK instance-generation jobs for the train and val claims.

    The train claim ids are split with ``split_7_3``. The QK candidates are
    partitioned by whether the query id of their first element belongs to
    the train or the val claims, and each partition is processed by its own
    JobRunner: ``name_prefix + "_train"`` (378 jobs) and
    ``name_prefix + "_val"`` (162 jobs).
    """
    # claim ids split to train/val
    print("Loading data ....")
    train_ids: List[int] = list(load_train_claim_ids())
    train, val = split_7_3(get_claims_from_ids(train_ids))

    def query_ids_of(claims):
        """Return the claim ids as strings, matching the query id format."""
        return {str(claim['cId']) for claim in claims}

    train_cids = query_ids_of(train)
    val_cids = query_ids_of(val)
    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    print("Generate instances : train")
    qk_candidate_train: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in train_cids
    ]
    qk_candidate_val = [
        qk for qk in qk_candidate if qk[0].query_id in val_cids
    ]

    def run_split(qk_units, num_jobs, suffix):
        """Create and start a JobRunner over `qk_units`."""
        def worker_factory(out_dir):
            return QCKWorker(qk_units, generator, out_dir)

        job_runner = JobRunner(job_man_dir, num_jobs, name_prefix + suffix,
                               worker_factory)
        job_runner.start()

    run_split(qk_candidate_train, 378, "_train")

    print("Generate instances : val")
    run_split(qk_candidate_val, 162, "_val")
示例#9
0
 def submit_jobs_inner(claim_ids, split_name):
     """Submit RM jobs for the queries built from the given claims.

     Results go to a per-split directory that is created when missing.
     """
     queries = get_claims_query(get_claims_from_ids(claim_ids))
     root_template = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3"
     out_root = root_template.format(split_name)
     exist_or_mkdir(out_root)
     submit_rm_jobs(queries, out_root)
示例#10
0
def get_eval_candidates(split, top_k=50) -> List[Tuple[int, List[Dict]]]:
    """Retrieve candidate perspectives for every claim of a split.

    Args:
        split: Split name passed to ``load_claim_ids_for_split``.
        top_k: Number of results requested per claim.

    Returns:
        One ``(cid, candidates)`` pair per claim. Each candidate is a dict
        with the keys ``cid``, ``pid``, ``claim_text``, ``perspective_text``,
        ``p_tokens`` (stemmed tokens of the perspective text) and
        ``rationale`` (retrieval rank and score as text).
    """
    # split -> claims
    claims: List[Dict] = get_claims_from_ids(load_claim_ids_for_split(split))
    tokenizer = PCTokenizer()

    def get_candidates(c: Dict) -> Tuple[int, List[Dict]]:
        cid = c["cId"]
        assert type(cid) == int
        claim_text = c["text"]
        hits = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list = [
            {
                'cid': cid,
                'pid': pid,
                'claim_text': claim_text,
                'perspective_text': text,
                'p_tokens': tokenizer.tokenize_stem(text),
                'rationale': "es_rank={} , es_score={}".format(rank, score),
            }
            for rank, (text, pid, score) in enumerate(hits)
        ]
        return cid, candidate_list

    return lmap(get_candidates, claims)
示例#11
0
def sum_random_walk_score(name_class):
    """Return, per term, random-walk mass minus initial-probability mass.

    Both term distributions are accumulated over all claims stored in their
    pickles and passed through ``normalize_counter_to_sum1`` before the
    subtraction. Stopwords are skipped while accumulating the initial
    probabilities, and terms of length 2 or less are left out of the result.

    Args:
        name_class: Inserted into the names of the two pickles that are read.

    Returns:
        A Counter mapping each kept term to the difference of the two
        normalised values.
    """
    d_ids: List[int] = list(load_train_claim_ids())
    # NOTE(review): claim_d is built but never read below.
    claim_d = claims_to_dict(get_claims_from_ids(d_ids))

    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()
    init_mass = Counter()
    for _claim_id, prob_scores in prob_score_d.items():
        for term, prob in prob_scores:
            if term in stopwords:
                continue
            init_mass[term] += prob

    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    walk_mass = Counter()
    for qtf in rw_score.values():
        for term, weight in qtf.items():
            walk_mass[term] += weight

    init_mass = normalize_counter_to_sum1(init_mass)
    walk_mass = normalize_counter_to_sum1(walk_mass)

    difference = Counter()
    for term, weight in walk_mass.items():
        if len(term) > 2:
            difference[term] = weight - init_mass[term]
    return difference
示例#12
0
def pc_new_init_prob():
    """Build and pickle an initial term distribution for every dev claim.

    Each stemmed token of the claim is weighted by its count times the
    largest value in the "bias_plus_words" counter; the bias-word scores are
    then added on top and the result is passed through
    ``normalize_counter_to_sum1``. The per-claim results are saved as
    "pc_dev_new_init_prob".
    """
    d_ids: List[int] = list(load_dev_claim_ids())
    claim_d = claims_to_dict(get_claims_from_ids(d_ids))
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    # One occurrence of a claim token weighs as much as the strongest
    # bias word.
    base_p = max(bias_plus_word.values())

    def init_scores(cid):
        """Return the normalised score counter for one claim id."""
        tokens = tokenizer.tokenize_stem(claim_d[cid])
        scores = Counter()
        for token, cnt in Counter(tokens).items():
            scores[token] = cnt * base_p
        for word, bias in bias_plus_word.items():
            scores[word] += bias
        return normalize_counter_to_sum1(scores)

    init_p_score_d = {cid: init_scores(cid) for cid in d_ids}
    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
示例#13
0
def run_reweight():
    """Predict with the reweighter on the dev claims and print the evaluation.

    Keeps the top 7 predictions per claim and passes ``{'k1': 0.5}`` as the
    parameter dict.
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    bm25_module = get_bm25_module()
    predictions = predict_by_reweighter(bm25_module, dev_claims, 7,
                                        {'k1': 0.5})
    print(evaluate(predictions))
示例#14
0
def save_dev():
    """Run ``a_relevant_candidate`` for the dev claims.

    Uses the dev query results file and the save name "pc_dev_a_passages".
    """
    dev_claims: List[Dict] = get_claims_from_ids(list(load_dev_claim_ids()))
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/dev_claim/q_res_100")
    a_relevant_candidate("pc_dev_a_passages", q_res_path, dev_claims)
示例#15
0
def work():
    """Run the binary feature demo on the first 10 train candidates."""
    train_claims = get_claims_from_ids(list(load_train_claim_ids()))
    # Second argument is the is_train flag.
    data_points = get_candidates(train_claims, True)
    binary_feature_demo(data_points[:10])
示例#16
0
def write_claim_perspective_pair_as_query(split: str = "dev"):
    """Write one BM25 query per (claim, perspective) candidate of a split.

    Each query is built from the cleaned tokens of the claim text followed
    by the perspective text and gets the id ``"<cid>_<pid>"``. Queries are
    written to the directory ``query_dir_format.format(split)``, 50 per file.

    Args:
        split: One of "train", "dev" or "test". Defaults to "dev", the
            value that used to be hard-coded.

    Raises:
        ValueError: If `split` is not a known split name.
    """
    # Map to the loader functions rather than to their results, so that
    # only the ids of the requested split are actually loaded.
    id_loaders = {
        "train": load_train_claim_ids,
        "dev": load_dev_claim_ids,
        "test": load_test_claim_ids,
    }
    if split not in id_loaders:
        raise ValueError("Unknown split: {!r}".format(split))

    d_ids = list(id_loaders[split]())
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0

    def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(get_query_entry_from_data_point, all_data_points)

    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50

    write_queries_to_files(n_query_per_file, out_dir, queries)
示例#17
0
def run_eval_with_dict(pickle_name):
    """Evaluate predictions derived from a pickled score dict on dev claims.

    Args:
        pickle_name: Name of the pickle holding the score dict.

    Prints the number of target claims and the evaluation of the top 8
    predictions per claim.
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    print("targets", len(dev_claims))
    score_dict = load_from_pickle(pickle_name)
    predictions = predict_from_dict(score_dict, dev_claims, 8)
    print(evaluate(predictions))
示例#18
0
def get_qck_queries_from_cids(d_ids: List[int]):
    """Return one QCKQuery per claim id.

    The query id is the claim id as a string and the query text is the
    claim text.
    """
    def to_query(claim: Dict):
        query_id = str(claim['cId'])
        return QCKQuery(query_id, claim['text'])

    return lmap(to_query, get_claims_from_ids(d_ids))
示例#19
0
def get_claim_lms() -> Dict[str, Counter]:
    """Return the gold language model of every train claim.

    The claims are those whose query ids belong to the "train" part of
    ``split_name2``. Keys of the result are claim ids as strings.
    """
    split = "train"
    cids = lmap(int, list(get_qids_for_split(split_name2, split)))
    gold_lms = build_gold_lms(get_claims_from_ids(cids))
    return {str(entry.cid): entry.LM for entry in gold_lms}
示例#20
0
def main():
    """Print id and text of every claim in the split named on the command line.

    The split name is read from the first command-line argument.
    """
    claim_ids = load_claim_ids_for_split(sys.argv[1])
    for claim in get_claims_from_ids(claim_ids):
        print("Claim {} :\t{}".format(claim['cId'], claim['text']))
示例#21
0
def write_simple_claim_queries():
    """Save simple claim queries for every split in ``splits``.

    Each split is written to
    ``<output_path>/perspective_query/simple_query_<split>.json``.
    """
    for split in splits:
        split_claims = get_claims_from_ids(load_claim_ids_for_split(split))
        queries = get_simple_claim_query(split_claims, True)
        file_name = "simple_query_{}.json".format(split)
        save_queries_to_file(
            queries, os.path.join(output_path, "perspective_query", file_name))
示例#22
0
def main():
    """Create the CPPNC dummy problem for the train claims.

    The save name comes from the parsed command-line arguments; candidate
    perspectives are read with ``get_eval_candidates_from_pickle("train")``.
    """
    args = parser.parse_args(sys.argv[1:])
    train_claims: List[Dict] = get_claims_from_ids(
        list(load_train_claim_ids()))
    candidate_perspectives: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle("train"))
    make_cppnc_dummy_problem(train_claims, candidate_perspectives,
                             args.save_name, encode_two_inputs)
示例#23
0
def run_eval_with_two_dict():
    """Evaluate dev predictions that combine two pickled score dicts.

    Reads "pc_bert_baseline_score_d" and "pc_random_walk_based_score_d",
    keeps the top 7 predictions per claim and prints the evaluation.
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    print("targets", len(dev_claims))
    baseline_scores = load_from_pickle("pc_bert_baseline_score_d")
    random_walk_scores = load_from_pickle("pc_random_walk_based_score_d")
    predictions = predict_from_two_dict(baseline_scores, random_walk_scores,
                                        dev_claims, 7)
    print(evaluate(predictions))
示例#24
0
def save_random_walk_pred():
    """Predict on dev claims with random-walk query vectors and pickle scores.

    Query vectors come from the "random_walk_score_100" pickle; the top 50
    predictions per claim are converted with ``prediction_to_dict`` and saved
    as "pc_random_walk_based_score_d".
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    query_vectors = dict(load_from_pickle("random_walk_score_100"))
    bm25_module = get_bm25_module()
    predictions = pc_predict_from_vector_query(bm25_module, query_vectors,
                                               dev_claims, 50)
    save_to_pickle(prediction_to_dict(predictions),
                   "pc_random_walk_based_score_d")
示例#25
0
def a_relevant():
    """Print per-document passage scores for the first 10 train claims.

    Gold language models are built from all train claims; their average is
    used as the background model. For each inspected claim a log-odds table
    (smoothed claim LM vs. background LM) scores the passages of up to
    `top_n` ranked documents, and a table of
    ``(rank, average score, max score)`` is printed. Documents that cannot
    be loaded, or that yield no scores, are shown as ``"-"``.
    """
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    # Only the first 10 claims are inspected; the LMs above use all claims.
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            """Return the mean log-odds of the tokens in `p` (0 if empty)."""
            def get_score(t):
                # Stopwords contribute nothing; other tokens are looked up
                # by their stem.
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        # Slice instead of indexing over range(top_n): a ranked list with
        # fewer than top_n entries must not raise IndexError.
        for entry in q_res[:top_n]:
            try:
                docs.append(load_doc(entry.doc_id))
            except KeyError:
                # Keep a placeholder so that ranks stay aligned.
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            if len(scores) == 0:
                # max() raises on an empty sequence; report the document
                # like a missing one instead.
                rows.append((rank, "-", "-"))
                continue
            rows.append((rank, average(scores), max(scores)))

        print_table(rows)
示例#26
0
def run_random_walk_score_with_weight():
    """Evaluate random-walk query vectors combined with reweighting on dev.

    The keys of the "random_walk_score_100" pickle are converted to int
    before prediction; the top 7 predictions per claim are evaluated with
    the parameter dict ``{'k1': 0.5}``.
    """
    dev_claims = get_claims_from_ids(list(load_dev_claim_ids()))
    raw_vectors = dict(load_from_pickle("random_walk_score_100"))
    query_vectors = dict_key_map(int, raw_vectors)
    bm25_module = get_bm25_module()
    predictions = pc_predict_vector_query_and_reweight(
        bm25_module, query_vectors, dev_claims, 7, {'k1': 0.5})
    print(evaluate(predictions))
示例#27
0
def run_lm():
    """Evaluate language-model based predictions on the dev claims.

    Query term frequencies come from the "random_walk_score_100" pickle;
    the top 5 predictions per claim are evaluated and printed.
    """
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    # The "pc_dev_par_tf" pickle used to be loaded here and was overwritten
    # on the very next line; that dead load has been removed.
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    ctf = load_collection_tf()
    pred = predict_by_lm(q_tf_replace, ctf, bm25, claims, top_k)
    print(evaluate(pred))
示例#28
0
def run_write_claims_as_plain_query():
    """Write the train and dev claims as plain-text queries, one per line.

    Output files are "train_claim_query_raw.txt" and
    "dev_claim_query_raw.txt" under ``output_path``.
    """
    for claim_ids, out_name in [
        (load_train_claim_ids(), "train_claim_query_raw.txt"),
        (load_dev_claim_ids(), "dev_claim_query_raw.txt"),
    ]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        # The with-block closes (and therefore flushes) each file; the
        # handle used to be left open.
        with open(pjoin(output_path, out_name), "w") as f:
            f.writelines(s + "\n" for s in q_str_list)
示例#29
0
def generate_classification_payload():
    """Pickle elastic-search predictions for the train and dev claims.

    50 results are requested per claim. Train predictions are saved as
    "perspective_cls_train_X" and dev predictions as "perspective_cls_dev_X".
    """
    top_k = 50

    # The validation part of the train split is not used here.
    train_claims, _val = train_split()
    save_to_pickle(predict_by_elastic_search(train_claims, top_k),
                   "perspective_cls_train_X")

    dev_ids: List[int] = list(load_dev_claim_ids())
    dev_claims = get_claims_from_ids(dev_ids)
    save_to_pickle(predict_by_elastic_search(dev_claims, top_k),
                   "perspective_cls_dev_X")
示例#30
0
def a_relevant():
    """Collect scored passages for every train claim and pickle them.

    For each claim a log-odds table (smoothed gold claim LM vs. the average
    of all gold claim LMs) scores the passages produced by
    ``iterate_passages`` over the top `top_n` ranked documents. The pickle
    "pc_train_a_passages" stores the pair ``(entries, all_passages)``:
    ``entries`` holds ``(claim, passages)`` pairs restricted to passages
    whose second element is greater than 0, and ``all_passages`` holds every
    passage of every claim.
    """
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        # A per-claim "base" score over the claim's own tokens used to be
        # computed here, but it was never read, so it has been removed.

        def get_passage_score(p):
            """Return the mean log-odds of the tokens in `p` (0 if empty)."""
            def get_score(t):
                # Stopwords contribute nothing; other tokens are looked up
                # by their stem.
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        # Materialise once: the passages are consumed twice below, which
        # would silently lose data if iterate_passages returned a one-shot
        # iterator. NOTE(review): its return type is not visible here.
        passages = list(iterate_passages(q_res, top_n, get_passage_score))

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")