Example No. 1
def enum_all_argument(split) -> Iterable[Passage]:
    assert split in header.splits
    all_topic_dir = pjoin(extracted_arguments, split)

    for topic in header.topics:
        per_topic_dir = pjoin(all_topic_dir, topic)
        for maybe_dir_obj in os.scandir(per_topic_dir):
            if not maybe_dir_obj.is_dir():
                continue
            dir_path = maybe_dir_obj.path

            con_dir = pjoin(dir_path, "_con")
            pro_dir = pjoin(dir_path, "pro")

            def load_files_in_dir(target_dir_path):
                assert os.path.basename(target_dir_path) in ["_con", "pro"]
                for file_path in get_dir_files(target_dir_path):
                    with open(file_path, "r", encoding='utf-8') as f:
                        content = f.read()
                    rel_path = get_rel_path(file_path, extracted_arguments)
                    yield Passage(content, ArguDataID.from_rel_path(rel_path))

            for item in load_files_in_dir(con_dir):
                yield item
            for item in load_files_in_dir(pro_dir):
                yield item
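
Note: the snippets on this page lean on a handful of project-level helpers that are not shown here. The following is only a minimal sketch of what they are assumed to do (pjoin as an alias for os.path.join, plus small path utilities and str-based type wrappers); the repository's actual definitions may differ.

import os
from typing import List, NewType

# Assumed aliases and type wrappers used throughout these examples
pjoin = os.path.join                 # every snippet joins paths with pjoin
FileName = NewType("FileName", str)  # thin str wrappers used in type hints
FilePath = NewType("FilePath", str)

def exist_or_mkdir(dir_path: str) -> None:
    # Assumed behavior: create the directory if it does not already exist
    os.makedirs(dir_path, exist_ok=True)

def get_dir_files(dir_path: str) -> List[str]:
    # Assumed behavior: full paths of the regular files directly under dir_path
    return [e.path for e in os.scandir(dir_path) if e.is_file()]

def get_rel_path(file_path: str, root_dir: str) -> str:
    # Assumed behavior: file_path expressed relative to root_dir
    return os.path.relpath(file_path, root_dir)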
Example No. 2
def show_bert_nli_diff():
    model_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli = os.path.join(model_dir, FileName("nli"),
                       FileName("model.ckpt-75000"))
    bert = os.path.join(model_dir, FileName("uncased_L-12_H-768_A-12"),
                        FileName("bert_model.ckpt"))

    show_embedding_difference(bert, nli)
Example No. 3
def alt_from_clueweb12_13A():
    model_dir = pjoin(output_path, "models")
    nli_checkpoint = pjoin(pjoin(model_dir, "nli_bert_300_K"),
                           "model.ckpt-73150")
    alt_emb_checkpoint = pjoin(pjoin(model_dir, "alt_emb_H"),
                               "model.ckpt-20000")
    save_path = os.path.join(output_path, "models", "nli_alt_emb_H20K",
                             "model.ckpt-73150")
    combine(nli_checkpoint, alt_emb_checkpoint, save_path)
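
combine is a repository function that is not reproduced on this page; judging from Example No. 18, it merges variables from two checkpoints into one. Purely as a hypothetical illustration of such a name-based merge (assumed names and behavior, not the repo's actual implementation), a TF1-compat sketch could look like this:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

def combine_checkpoints(base_ckpt: str, override_ckpt: str, save_path: str) -> None:
    # Hypothetical sketch: start from every variable in base_ckpt, replace the
    # ones that also exist in override_ckpt, then write the result to save_path.
    tf.compat.v1.reset_default_graph()
    base = tf.train.load_checkpoint(base_ckpt)
    override = tf.train.load_checkpoint(override_ckpt)
    override_names = set(override.get_variable_to_shape_map())

    for name in base.get_variable_to_shape_map():
        reader = override if name in override_names else base
        # Recreate each variable under its checkpoint name with the chosen value
        tf.compat.v1.get_variable(name, initializer=reader.get_tensor(name))

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        tf.compat.v1.train.Saver().save(sess, save_path)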
Example No. 4
def get_nli_and_bert_embeddings():
    model_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli = os.path.join(model_dir, FileName("nli"),
                       FileName("model.ckpt-75000"))
    bert = os.path.join(model_dir, FileName("uncased_L-12_H-768_A-12"),
                        FileName("bert_model.ckpt"))
    nli_emb = get_embedding_table(nli)
    bert_emb = get_embedding_table(bert)
    return bert_emb, nli_emb
Example No. 5
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)

    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)
Example No. 6
def count_tf():
    continuation_tokens = get_continuation_token_ids()

    dir_path = pjoin(output_path, FileName("nli_tfrecord_cls_300"))

    tf_train = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("train")))
    tf_dev = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("dev_mis")))

    save_to_pickle(tf_dev, "nli_tf_dev_mis")
    save_to_pickle(tf_train, "nli_tf_train")
Example No. 7
def count_tf():
    continuation_tokens = get_continuation_token_ids()

    out_dir = pjoin(output_path, "eHealth")
    train_save_path = pjoin(out_dir, "tfrecord_train")
    test_save_path = pjoin(out_dir, "tfrecord_test")
    tf_train = build_word_tf(continuation_tokens, train_save_path)
    tf_dev = build_word_tf(continuation_tokens, test_save_path)

    save_to_pickle(tf_train, "eHealth_tf_train")
    save_to_pickle(tf_dev, "eHealth_tf_dev")
Example No. 8
def main():
    in_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(in_dir)
    input_path_train = pjoin(in_dir, "tfrecord_train")
    input_path_test = pjoin(in_dir, "tfrecord_test")

    out_dir = os.path.join(output_path, "ehealth_alt")
    exist_or_mkdir(out_dir)
    output_file_path_train = os.path.join(out_dir, "train")
    output_file_path_test = os.path.join(out_dir, "test")

    convert(input_path_train, output_file_path_train)
    convert(input_path_test, output_file_path_test)
Example No. 9
def count_tf():
    continuation_tokens = get_continuation_token_ids()

    dir_path = pjoin(output_path, FileName("eHealth"))

    tf_train = build_word_tf(continuation_tokens,
                             pjoin(dir_path, FileName("tfrecord_train")))
    tf_dev = build_word_tf(continuation_tokens,
                           pjoin(dir_path, FileName("tfrecord_test")))

    save_to_pickle(tf_train, "clef1_tf_train")
    save_to_pickle(tf_dev, "clef1_tf_test")
Example No. 10
def count_tf():
    continuation_tokens = get_continuation_token_ids()

    dataset_dir = pjoin(data_path, "ukp_300")
    for topic in all_topics:
        train_data_path = pjoin(dataset_dir, "train_{}".format(topic))
        test_data_path = pjoin(dataset_dir, "dev_{}".format(topic))
        tf_train = build_word_tf(continuation_tokens, train_data_path)
        tf_dev = build_word_tf(continuation_tokens, test_data_path)

        save_to_pickle(tf_train, "tf_train_{}".format(topic))
        save_to_pickle(tf_dev, "tf_dev_{}".format(topic))
Example No. 11
def combine_nli_alt_emb():
    model_dir = pjoin(output_path, "models")
    nli_checkpoint = pjoin(pjoin(model_dir, "nli_bert_300_K"),
                           "model.ckpt-73150")
    #alt_emb_checkpoint = pjoin(pjoin(model_dir, "alt_emb_F"), "model.ckpt-10000")
    #alt_emb_checkpoint = pjoin(pjoin(model_dir, "alt_emb_G"), "model.ckpt-0")
    alt_emb_checkpoint = pjoin(pjoin(model_dir, "alt_emb_G"),
                               "model.ckpt-100000")

    save_path = os.path.join(output_path, "models", "nli_alt_emb_100KF",
                             "model.ckpt-73150")
    combine(nli_checkpoint, alt_emb_checkpoint, save_path)
Example No. 12
def load_label(split, topic) -> Iterator:
    split_dir = pjoin(pair_best_counter, split)

    topic_dir = pjoin(split_dir, topic)

    file_list = [
        "01-debate-opposing-counters.tsv",
        "02-debate-counters.tsv",
        "03-debate-opposing-arguments.tsv",
        "04-debate-arguments.tsv",
        "05-theme-counters.tsv",
        "06-theme-arguments.tsv"
    ]

    return load_tsv_or_from_zip(topic_dir, file_list[4])
Example No. 13
def main():
    dataset_dir = pjoin(data_path, "ukp_300")

    for topic in all_topics:
        train_data_path = pjoin(dataset_dir, "train_{}".format(topic))
        test_data_path = pjoin(dataset_dir, "dev_{}".format(topic))

        out_dir = os.path.join(output_path, "ukp_alt")
        exist_or_mkdir(out_dir)
        output_file_path_train = os.path.join(out_dir,
                                              "train_{}".format(topic))
        output_file_path_test = os.path.join(out_dir, "dev_{}".format(topic))

        convert(test_data_path, output_file_path_test)
        convert(train_data_path, output_file_path_train)
Example No. 14
def save_concat_dev():
    #    prediction_path = pjoin(output_path, "pc_long_seq11")
    prediction_path = pjoin(output_path, "pc_long_focus_1")
    scores: Dict[CPID, List[float]] = collect_pipeline2_score(
        prediction_path, "pc_rel_dev_info_all")
    reduced_score: Dict[CPID, float] = dict_value_map(sum, scores)
    save_to_pickle(reduced_score, "pc_concat_dev_score")
Example No. 15
def bert_baseline_repeat():
    info = load_from_pickle("eHealth_test_info")
    for i in [3,4,5]:
        prediction_name = "eHealth_bert_freeze_{}".format(i)
        pred_data = EstimatorPredictionViewerGosford(prediction_name)
        out_path = pjoin(subdir_root, "bert_baseline_{}.txt".format(i))
        prediction_to_ranked_list(pred_data, info, out_path)
Example No. 16
def save_for_train():
    info = load_from_pickle("pc_rel_info_all")
    prediction_path = pjoin(output_path, "pc_rel")
    rel_info: Dict[DataID, Tuple[CPIDPair, Logits,
                                 Logits]] = combine_pc_rel_with_cpid(
                                     prediction_path, info)
    save_to_pickle(rel_info, "pc_rel_with_cpid")
Example No. 17
def pc_predict_by_bert_next_sent(bm25_module: BM25, claims,
                                 top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    port = 8123
    # XML-RPC proxy to the remote prediction server
    proxy = xmlrpc.client.ServerProxy(
        'http://ingham.cs.umass.edu:{}'.format(port))

    voca_path = pjoin(data_path, "bert_voca.txt")
    encoder = EncoderUnitPlain(512, voca_path)

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        i_claim_id = int(claim_id)
        payload = []
        p_text = perspective_getter(int(p_id))
        c_text = cid_to_text[i_claim_id]
        payload.append(encoder.encode_pair(c_text, p_text))
        r = proxy.predict(payload)
        ns_score = -float(r[0])
        #ns_score = 0
        score = bm25_module.score(c_text, p_text)
        new_score = score + ns_score * 10
        score = NamedNumber(new_score, score.name + " {}".format(ns_score))
        return score

    r = predict_interface(claims, top_k, scorer)
    return r
Example No. 18
def compare_before_after():
    tokenizer = get_tokenizer()

    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("heavy metal"))
    dir_name = pjoin(pjoin(output_path, FileName("model")),
                     FileName("alt_emb_heavy_metal_D"))
    before = pjoin(dir_name, FileName("model.ckpt-0"))
    after = pjoin(dir_name, FileName("model.ckpt-10000"))

    v1_d = load_checkpoint_vars(before)
    v2_d = load_checkpoint_vars(after)

    for key in v1_d:
        if key in v2_d:
            s = np.sum(v1_d[key] - v2_d[key])
            if np.abs(s) > 0.01:
                print(key, s)

    ori_emb = v2_d['bert/embeddings/word_embeddings']
    alt_emb_before = v1_d['bert/embeddings/word_embeddings_alt']
    alt_emb_after = v2_d['bert/embeddings/word_embeddings_alt']

    def show_diff_from_ori(token_id):
        diff = np.sum(np.abs(ori_emb[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    def show_diff_from_step0(token_id):
        diff = np.sum(
            np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    print("Diff against original embedding")
    print("Target words")
    for token_id in ids:
        show_diff_from_ori(token_id)

    print("Random words")
    for token_id in [321, 598, 5854]:
        show_diff_from_ori(token_id)

    print("Diff against step0 random init embedding")
    print("Target words")
    for token_id in range(0, 30000):
        diff = np.sum(
            np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        if diff > 0.001:
            print(token_id, diff)
Example No. 19
def main():
    train_queries, test_queries = get_query_split()
    out_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(out_dir)
    train_save_path = pjoin(out_dir, "tfrecord_train")
    test_save_path = pjoin(out_dir, "tfrecord_test")
    ranked_list_path = FilePath(
        os.path.join(output_path, "eHealth", "bm25_filtered.list"))
    ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path)
    qrels = load_clef_qrels()

    train_info = write_tfrecord(ranked_list, train_queries, qrels,
                                train_save_path)
    save_to_pickle(train_info, "eHealth_train_info")
    test_info = write_tfrecord(ranked_list, test_queries, qrels,
                               test_save_path)
    save_to_pickle(test_info, "eHealth_test_info")
Example No. 20
def print_features():
    job_dir = "ukp_paragraph_feature_2"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    with open(file_path, "rb") as f:
        features: List[ParagraphFeature] = pickle.load(f)

    out_path = pjoin(output_path, FileName("ukp_paragraph_feature_2.html"))
    print_paragraph_feature(features, out_path)
Example No. 21
def run_write_claims_as_plain_query():
    for claim_ids, out_name in [
        (load_train_claim_ids(), "train_claim_query_raw.txt"),
        (load_dev_claim_ids(), "dev_claim_query_raw.txt")
    ]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")
Example No. 22
def load_tsv_or_from_zip(dir_path, file_name) -> Iterator:
    file_path = pjoin(dir_path, file_name)
    if not os.path.exists(file_path):
        print("extracting from zip...")
        zip_file_path = file_path + ".zip"
        extract_zip_file_at(zip_file_path, dir_path)

    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            yield row
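
extract_zip_file_at is another repository helper that is not shown here; a minimal sketch of the assumed behavior (unpack the whole archive into dir_path so the missing .tsv appears), using only the standard library:

import zipfile

def extract_zip_file_at(zip_file_path: str, dir_path: str) -> None:
    # Assumed behavior: extract every member of the archive into dir_path
    with zipfile.ZipFile(zip_file_path, "r") as zf:
        zf.extractall(dir_path)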
Example No. 23
    def __init__(self, word_list: List[str], out_path):
        self.out_dir = out_path
        tokenizer = get_tokenizer()
        self.seq_set: List[List[int]] = []
        self.input_dir = pjoin(sydney_working_dir,
                               FileName("alt_emb_heavy_metal"))

        for word in word_list:
            subwords = tokenizer.tokenize(word)
            ids = tokenizer.convert_tokens_to_ids(subwords)
            print(subwords, ids)
            self.seq_set.append(ids)
Example No. 24
def print_features():
    job_dir = "perspective_paragraph_feature"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))

    with open(file_path, "rb") as f:
        features: List[ParagraphClaimPersFeature] = pickle.load(f)
    features: List[ParagraphFeature] = lmap(to_paragraph_feature, features)

    out_path = pjoin(output_path,
                     FileName("perspective_paragraph_feature.html"))
    print_paragraph_feature(features, out_path)
Example No. 25
def predict_by_para_scorer(score_pred_file_name: FileName,
                           cpid_resolute_file: FileName,
                           claims,
                           top_k) -> List[Tuple[str, List[Dict]]]:
    suc_count = SuccessCounter()
    suc_count.reset()

    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    print("Loading cpid_resolute")
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(cpid_resolute_file)
    print("Loading paragraph triple scores")
    score_d: Dict[CPID, float] = get_cpid_score_from_cache_or_raw(pred_path, cpid_resolute, "avg")

    per_claim_suc = {}
    per_claim_counter = {}

    def scorer(lucene_score, query_id):
        claim_id, p_id = query_id.split("_")
        if claim_id not in per_claim_suc:
            per_claim_counter[claim_id] = Counter()
            per_claim_suc[claim_id] = SuccessCounter()

        if query_id in score_d:
            cls_score = score_d[query_id]
            per_claim_suc[claim_id].suc()
            if cls_score > 0.8:
                per_claim_counter[claim_id][1] += 1
            elif cls_score < 0.3:
                per_claim_counter[claim_id][0] += 1
            suc_count.suc()
        else:
            cls_score = 0.5
            per_claim_suc[claim_id].fail()
            suc_count.fail()

        score = 0.9 * cls_score + 0.1 * lucene_score / 20
        return score

    r = predict_interface(claims, top_k, scorer)
    for claim in per_claim_suc:
        suc_counter = per_claim_suc[claim]
        print("{} suc/total={}/{}  True/False={}/{}".format(
            claim, suc_counter.get_suc(), suc_counter.get_total(),
            per_claim_counter[claim][1], per_claim_counter[claim][0]
        ))

    print("{} found of {}".format(suc_count.get_suc(), suc_count.get_total()))
    return r
Example No. 26
def main():
    train_queries, test_queries = get_query_split()
    out_dir = pjoin(output_path, "eHealth")
    exist_or_mkdir(out_dir)
    ranked_list_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/CLEF_eHealth_working/ranked_list_filtered"
    )
    ranked_list: RankedListDict = load_galago_ranked_list(ranked_list_path)
    qrels = load_clef_qrels()

    new_d = {}
    for query in test_queries:
        new_d[query.qid] = ranked_list[query.qid]

    save_path = os.path.join(out_dir, 'test_baseline.list')
    write_ranked_list_from_d(new_d, save_path)
Example No. 27
def eval(
    score_pred_file_name: FileName,
    cpid_resolute_file: FileName,
    n_way=3,
):
    topic = "abortion"
    pred_path: FilePath = pjoin(output_path, score_pred_file_name)
    dpid_resolute: Dict[str, DPID] = load_dpid_resolute(cpid_resolute_file)
    score_d: Dict[DPID,
                  np.ndarray] = get_datapoint_score(pred_path, dpid_resolute,
                                                    "avg")

    def argmax(arr: np.ndarray) -> int:
        return arr.argmax()

    pred_d: Dict[DPID, int] = dict_value_map(argmax, score_d)

    dev_labels = get_dev_labels(topic)
    if n_way == 2:

        def merge_label(e):
            dpid, label = e
            return dpid, {
                0: 0,
                1: 1,
                2: 1,
            }[label]

        dev_labels = lmap(merge_label, dev_labels)

    def fetch_pred(e: Tuple[DPID, int]):
        dpid, label = e
        pred = pred_d[dpid]
        return pred

    gold_list: List[int] = right(dev_labels)
    pred_list: List[int] = lmap(fetch_pred, dev_labels)
    if n_way == 3:
        all_result = eval_3label(gold_list, pred_list)
    elif n_way == 2:
        all_result = eval_2label(gold_list, pred_list)
    else:
        assert False
    print(all_result)
    f1 = sum([result['f1'] for result in all_result]) / n_way
    print("Avg F1 : ", f1)
Example No. 28
def collect_save_relevance_score():
    prediction_file = pjoin(output_path, "pc_rel")

    info_d = load_from_pickle("pc_rel_info_all")
    print("Building twostepdict")
    #two_step_d = TwoStepDict(info_d)

    # info_list = list(info_d.items())
    # info_list.sort(key=lambda x: x[0])
    # idx = 0
    # for a, b in info_list:
    #     print(a)
    #     assert idx == a
    #     idx += 1
    print("Collect pc_rel")

    relevance_scores: Dict[CPIDPair, List[Tuple[Logits, Logits]]] = collect_pc_rel_score(prediction_file, info_d)
    save_to_pickle(relevance_scores, "pc_relevance_score")
Example No. 29
def main():
    queries = load_queries()
    bm25_path = pjoin(cord_working_dir, "youngwoo_bm25_query")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(bm25_path)
    out_path = os.path.join(cord_working_dir, "tfrecord_2_4")
    max_seq_length = 512

    meta_data: List[Dict] = read_csv_as_dict(meta_data_path)

    text_dict = {}
    for e in meta_data:
        text_dict[e[str_cord_uid]] = e[str_title] + " " + e[str_abstract]

    def get_text_from_doc_id(doc_id:str) -> str:
        return text_dict[doc_id]

    data_info_save_name = "data_info_save"
    tf_record_gen(ranked_list, queries, get_text_from_doc_id, out_path, max_seq_length, data_info_save_name)
Example No. 30
def main():
    info = load_from_pickle("pc_rel_dev_info_all")
    prediction_path = pjoin(output_path, "pc_rel_dev")
    rel_info: Dict[DataID,
                   Tuple[CPIDPair, Logits,
                         Logits]] = load_from_pickle("pc_rel_dev_with_cpid")
    #rel_info: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = combine_pc_rel_with_cpid(prediction_path, info)

    doc_index = reverse_index(rel_info)
    tokenizer = get_tokenizer()

    while True:
        s = input()
        os.system('cls')
        cid, pid = s.split()
        cid = int(cid)
        pid = int(pid)
        cpid = CPIDPair((cid, pid))
        do_print(cpid, doc_index, tokenizer)