Example No. 1
def start_generate_jobs_for_train_val(generator: InstanceGenerator,
                                      name_prefix):
    # claim ids split to train/val
    print("Loading data ....")
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)

    train_cids = {str(t['cId']) for t in train}
    val_cids = {str(t['cId']) for t in val}
    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    print("Generate instances : train")
    qk_candidate_train: List[QKUnit] = list(
        [qk for qk in qk_candidate if qk[0].query_id in train_cids])
    qk_candidate_val = list(
        [qk for qk in qk_candidate if qk[0].query_id in val_cids])

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    runner = JobRunner(job_man_dir, 378, name_prefix + "_train",
                       worker_factory)
    runner.start()

    print("Generate instances : val")

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunner(job_man_dir, 162, name_prefix + "_val", worker_factory)
    runner.start()
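The 7:3 train/validation split relies on split_7_3, which is defined elsewhere in the repository. A minimal sketch of what such a splitter could look like, assuming a plain deterministic cut at the 70% mark:

def split_7_3(items):
    # Hypothetical sketch: deterministic 70/30 split of a list.
    # The actual split_7_3 in the repository may shuffle or round differently.
    cut = int(len(items) * 0.7)
    return items[:cut], items[cut:]

Note that the job counts above (378 and 162) are themselves in a 7:3 ratio.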
Example No. 2
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface], writer,
        name_prefix):
    # claim ids split to train/val
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)
    data = load_from_pickle("pc_train_a_passages")
    entries, all_passages = data
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: p
        for claim, p in entries
    }
    generator = generator_functor(cid_to_passages)

    print("Generate instances : train")

    def worker_factory(out_dir):
        return CPPNCWorker(train, generator, writer, out_dir)

    runner = JobRunner(job_man_dir, 378, name_prefix + "_train",
                       worker_factory)
    runner.start()

    print("Generate instances : val")

    def worker_factory(out_dir):
        return CPPNCWorker(val, generator, writer, out_dir)

    runner = JobRunner(job_man_dir, 162, name_prefix + "_val", worker_factory)
    runner.start()
Example No. 3
def write_claim_perspective_pair_as_query():
    split = "dev"
    assert split in ["train", "dev", "test"]

    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0

    def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(get_query_entry_from_data_point, all_data_points)

    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50

    write_queries_to_files(n_query_per_file, out_dir, queries)
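write_queries_to_files shards the query list into files of n_query_per_file entries each. A minimal sketch of such a writer, assuming a sequential file-naming scheme and a pluggable serializer; the repository's actual naming and serialization are not shown in this example:

import os

def write_queries_to_files(n_query_per_file, out_dir, queries, serialize=str):
    # Hypothetical sketch: split queries into shards of n_query_per_file entries
    # and write each shard to its own file in out_dir. The serializer is a
    # placeholder; the real writer defines its own query format.
    for file_idx, start in enumerate(range(0, len(queries), n_query_per_file)):
        shard = queries[start:start + n_query_per_file]
        with open(os.path.join(out_dir, str(file_idx)), "w") as f:
            for q in shard:
                f.write(serialize(q) + "\n")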
Example No. 4
def work():
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    is_train = True
    all_data_points = get_candidates(claims, is_train)
    all_data_points = all_data_points[:10]
    binary_feature_demo(all_data_points)
Example No. 5
def sum_random_walk_score(name_class):
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()
    acc_counter_prob_init = Counter()
    for claim_id, prob_scores in prob_score_d.items():
        for k, v in prob_scores:
            if k not in stopwords:
                acc_counter_prob_init[k] += v

    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    acc_counter = Counter()
    for claim_id, qtf in rw_score.items():
        for k, v in qtf.items():
            acc_counter[k] += v

    acc_counter_prob_init = normalize_counter_to_sum1(acc_counter_prob_init)
    acc_counter = normalize_counter_to_sum1(acc_counter)

    new_counter = Counter()
    for k, v in acc_counter.items():
        if len(k) > 2:
            new_v = v - acc_counter_prob_init[k]
            new_counter[k] = new_v

    return new_counter
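normalize_counter_to_sum1 rescales a Counter so that its values sum to one, which makes the accumulated probability mass and the random-walk mass comparable before the per-term subtraction above. A minimal sketch, assuming a straightforward proportional rescale:

from collections import Counter

def normalize_counter_to_sum1(counter):
    # Hypothetical sketch: divide every value by the total so the counter sums to 1.
    total = sum(counter.values())
    if total == 0:
        return Counter()
    return Counter({k: v / total for k, v in counter.items()})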
Example No. 6
def save_train():
    save_name = "pc_train_a_passages"
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    a_relevant(save_name, q_res_path, claims)
Example No. 7
def main():
    args = parser.parse_args(sys.argv[1:])
    save_name = args.save_name
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    candidate_perspectives: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle("train"))
    make_cppnc_dummy_problem(claims, candidate_perspectives, save_name,
                             encode_two_inputs)
Example No. 8
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                # document text is unavailable; keep a placeholder to preserve rank order
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
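The per-token score in get_passage_score is a log-odds ratio between a smoothed claim language model and the background language model. A minimal sketch of the Counter operations involved (smooth, get_lm_log, subtract), assuming Jelinek-Mercer interpolation with weight alpha; the repository's implementations may differ in detail:

import math
from collections import Counter

def smooth(topic_lm, bg_lm, alpha):
    # Hypothetical sketch: P(t) = alpha * P_topic(t) + (1 - alpha) * P_bg(t)
    keys = set(topic_lm) | set(bg_lm)
    return Counter({t: alpha * topic_lm[t] + (1 - alpha) * bg_lm[t] for t in keys})

def get_lm_log(lm):
    # Hypothetical sketch: element-wise log of the probabilities.
    return Counter({t: math.log(p) for t, p in lm.items() if p > 0})

def subtract(log_a, log_b):
    # Hypothetical sketch: log-odds = log P_topic(t) - log P_bg(t).
    return Counter({t: log_a[t] - log_b[t] for t in log_a})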
Example No. 9
def run_write_claims_as_plain_query():
    for claim_ids, out_name in [
        (load_train_claim_ids(), "train_claim_query_raw.txt"),
        (load_dev_claim_ids(), "dev_claim_query_raw.txt")
    ]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")
Example No. 10
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
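Several examples in this listing use small list-returning functional helpers (lmap, lfilter) and pair projections (left, right). Their behavior can be read off from the call sites; a minimal sketch, not the repository's exact definitions:

def lmap(f, xs):
    # list-returning map
    return list(map(f, xs))

def lfilter(f, xs):
    # list-returning filter
    return list(filter(f, xs))

def left(pairs):
    # first element of each pair
    return [a for a, _ in pairs]

def right(pairs):
    # second element of each pair
    return [b for _, b in pairs]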
Example No. 11
def save_bm25_as_trec_format():
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 200
    candidate_dict: List[Tuple[int, List[int]]] = get_eval_candidates_w_q_text(
        claim_as_query(claims), top_k)
    pred = predict_by_bm25_from_candidate(get_bm25_module(), claims,
                                          candidate_dict, top_k)
    entries = prediction_to_trec_format(pred, "bm25")
    write_trec_ranked_list_entry(
        entries, os.path.join(output_path, "ranked_list", "bm25.txt"))
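prediction_to_trec_format and write_trec_ranked_list_entry produce a ranked list in the standard TREC run format, one line per (query, document) pair. A minimal sketch of the writer, assuming each entry carries query_id, doc_id, rank, score, and run_name; the repository's entry objects may expose these as attributes rather than a tuple:

def write_trec_ranked_list_entry(entries, out_path):
    # Hypothetical sketch of the standard TREC run format:
    #   query_id Q0 doc_id rank score run_name
    with open(out_path, "w") as f:
        for query_id, doc_id, rank, score, run_name in entries:
            f.write("{} Q0 {} {} {} {}\n".format(
                query_id, doc_id, rank, score, run_name))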
Example No. 12
def show_missing():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    preload_docs(ranked_list, claims, top_n)
    report_missing(claims, ranked_list, top_n)
Example No. 13
def write_claim_queries_k0():
    def write(claim_ids, split_name):
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims, True)
        out_path = os.path.join(
            output_path,
            "perspective_{}_claim_query_k0.json".format(split_name))
        save_queries_to_file(queries, out_path)

    claim_ids, split_name = (load_train_claim_ids(), "train")

    write(claim_ids, split_name)
    claim_ids, split_name = (load_dev_claim_ids(), "dev")
    write(claim_ids, split_name)
Example No. 14
def write_claim_as_query():
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    queries = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        tokens = claim_text.split()
        query_text = clean_query(tokens)
        print(query_text)
        q_entry = get_query_entry_bm25_anseri(cid, query_text)
        queries.append(q_entry)

    out_path = os.path.join(output_path, "perspective_dev_claim_query.json")
    save_queries_to_file(queries, out_path)
Example No. 15
def work():
    claim_ids, split_name = (load_train_claim_ids(), "train")
    print("Num claims in train : ", len(list(claim_ids)))

    exit()  # early exit: everything below is unreachable; remove this line to actually submit the dev jobs

    def submit_jobs_inner(claim_ids, split_name):
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims)
        out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(
            split_name)
        exist_or_mkdir(out_root)
        submit_rm_jobs(queries, out_root)

    claim_ids, split_name = (load_dev_claim_ids(), "dev")
    submit_jobs_inner(claim_ids, split_name)
Example No. 16
def run_get_claim_term_weighting():
    # Load claim
    # Do dependency parsing
    # show top level roles ( dobj, sub )
    param = {'k1': 0.5}
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    out_d = get_claim_term_weighting(claims, param)
    nlp = spacy.load("en_core_web_sm")

    for c in claims:
        weight = out_d[c['cId']]
        for token in nlp(c['text']):
            s = token.text
            if weight[s] > 1:
                s = "[{}]".format(s)
            print(s, end=" ")
        print()
Example No. 17
def main():
    print("Loading data ....")
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)

    val_cids = {str(t['cId']) for t in val}

    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    qk_candidate_val = list(
        [qk for qk in qk_candidate if qk[0].query_id in val_cids])

    print(qk_candidate_val[0][0])

    for q, kdp_list in qk_candidate_val[1:9]:
        job_id = request_kdp_eval(kdp_list)
        print('qid:', q.query_id)
        print('job_id', job_id)
Example No. 18
def load_train_claim_d():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_d = {c['cId']: c['text'] for c in claims}
    return claim_d
Example No. 19
def work():
    split = "train"
    assert split in ["train", "dev", "test"]

    tokenizer = PCTokenizer()
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    print(len(claims), " claims")
    do_balance = False
    all_data_points: List[PerspectiveCandidate] = get_candidates(
        claims, do_balance)

    grouped: Dict[str, List] = group_by(all_data_points, lambda x: x.cid)

    def get_frequency_per_class(datapoints: List[PerspectiveCandidate]):
        pos_text = []
        neg_text = []
        for dp in datapoints:
            tokens = tokenizer.tokenize_stem(dp.p_text)
            tf = Counter(tokens)
            dl = sum(tf.values())
            tf_rel = {k: v / dl for k, v in tf.items()}

            if dp.label == "1":
                pos_text.append(tf_rel)
            elif dp.label == "0":
                neg_text.append(tf_rel)
            else:
                assert False

        def accumulate(tf_list: List[Dict]):
            out_c = Counter()
            n = len(tf_list)
            for tf in tf_list:
                for k, v in tf.items():
                    out_c[k] += v / n

            return out_c

        pos_avg_tf = accumulate(pos_text)
        neg_avg_tf = accumulate(neg_text)
        return pos_avg_tf, neg_avg_tf

    class_freq: Dict[str,
                     Tuple[Counter,
                           Counter]] = dict_value_map(get_frequency_per_class,
                                                      grouped)

    save_to_pickle(class_freq, "per_claim_class_word_tf_{}".format(split))

    def normalize(s_list: List[float]) -> List[float]:
        m = sum(s_list)
        return list([s / m for s in s_list])

    pos_prob_dict = {}
    neg_prob_dict = {}

    for cid, info in class_freq.items():
        pos, neg = info
        all_words = set(pos.keys())
        all_words.update(neg.keys())

        info = []
        for word in all_words:
            score = pos[word] - neg[word]
            info.append((word, score))

        pos_scores = list([(w, s) for w, s in info if s > 0])
        neg_scores = list([(w, s) for w, s in info if s < 0])

        def normalize_right(pair_list):
            right_scores = normalize(right(pair_list))
            return list(zip(left(pair_list), right_scores))

        pos_prob_dict[cid] = normalize_right(pos_scores)
        neg_prob_dict[cid] = normalize_right(neg_scores)

    save_to_pickle(pos_prob_dict, "pc_pos_word_prob_{}".format(split))
    save_to_pickle(neg_prob_dict, "pc_neg_word_prob_{}".format(split))
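group_by and dict_value_map are generic dictionary helpers whose semantics are clear from the usage above: group_by buckets data points by claim id, and dict_value_map applies a function to every value of a dict. A minimal sketch, assuming the obvious definitions:

from collections import defaultdict

def group_by(items, key_fn):
    # Hypothetical sketch: bucket items by key_fn(item).
    grouped = defaultdict(list)
    for item in items:
        grouped[key_fn(item)].append(item)
    return dict(grouped)

def dict_value_map(f, d):
    # Hypothetical sketch: apply f to every value, keeping keys.
    return {k: f(v) for k, v in d.items()}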
Example No. 20
def main():
    d_ids = list(load_train_claim_ids())
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    save_name = "pc_token_train"
    return do_datagen(d_ids, q_res_path, save_name)
Example No. 21
def join_docs_and_lm():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords.update([".", ",", "!", "?"])

    alpha = 0.1

    html_visualizer = HtmlVisualizer("doc_lm_joined.html")

    def get_cell_from_token2(token, probs):  # note: unused here; the loop below calls get_cell_from_token
        if token.lower() in stopwords:
            probs = 0
        probs = probs * 1e5
        s = min(100, probs)
        c = Cell(token, s)
        return c

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))

        clusters: List[List[int]] = gold[c['cId']]

        for cluster in clusters:
            html_visualizer.write_paragraph("---")
            p_text_list: List[str] = lmap(perspective_getter, cluster)
            for text in p_text_list:
                html_visualizer.write_paragraph(text)
            html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)),
                             doc)
                html_visualizer.write_headline("Doc rank {}".format(i))
                html_visualizer.multirow_print(cells, width=20)
            except KeyError:
                pass
        html_visualizer.write_paragraph("Not found: {}".format(not_found))
Example No. 22
def train_split():
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    train, val = split_7_3(claims)
    return claims, val
Example No. 23
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")

    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)]
                        for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))

        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])

        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " +
                                        score_line(random5_scores))

        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)

        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))
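The numeric helpers used for reporting (average, two_digit_float) are simple utilities; a minimal sketch, assuming the obvious definitions (average assumes a non-empty sequence):

def average(xs):
    # arithmetic mean of a non-empty sequence
    return sum(xs) / len(xs)

def two_digit_float(x):
    # format a float with two decimal places
    return "{0:.2f}".format(x)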