Example #1
def main():
    # Load the pre-retrieved documents for each dev-split claim.
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    # Only the document-frequency table is needed here; term frequencies are discarded.
    _, clue12_13_df = load_clueweb12_B13_termstat()
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    # Select paragraphs for each claim with the "topk" option and persist them.
    r = select_paragraph(docs, clue12_13_df, claims, "topk")
    save_to_pickle(r, "dev_claim_paras")
Example #2
    def __init__(self, split, q_config_id, out_dir):
        self.out_dir = out_dir
        self.ci = StaticRankedListInterface(q_config_id)
        print("load_data_point")
        self.all_data_points = load_data_point(split)
        print("Load term stat")
        _, clue12_13_df = load_clueweb12_B13_termstat()
        self.clue12_13_df = clue12_13_df
        self.tokenizer = get_tokenizer()
Example #3
    def __init__(self, option, out_dir):
        self.out_dir = out_dir
        self.ci = RankedListInterface()
        print("load__data_point")
        self.all_data_points: List[TPDataPoint] = lmap(ukp_datapoint_to_tp_datapoint, load_all_data_flat())
        self.data_step_size = 50

        total_jobs = ceil_divide(len(self.all_data_points), self.data_step_size)
        print("total_jobs :", total_jobs )
        print("Load term stat")
        _, clue12_13_df = load_clueweb12_B13_termstat()
        self.clue12_13_df = clue12_13_df
        self.dp_id_to_q_res_id_fn = build_dp_id_to_q_res_id_fn()
        self.tokenizer = get_tokenizer()
        self.option = option
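
Presumably ceil_divide above is plain ceiling division, used to turn len(all_data_points) into a job count; a minimal sketch of that assumption:

def ceil_divide(n, d):
    # Ceiling division without floats: ceil_divide(101, 50) == 3.
    return -(-n // d)

# 101 data points with data_step_size = 50 yields jobs covering
# indices [0, 50), [50, 100), [100, 101).
print(ceil_divide(101, 50))  # 3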
Example #4
def count_term_stat(doc_list, unigrams):
    # For each term t in `unigrams`, accumulate term and document frequencies
    # separately over documents that mention controversy ("cont") and those
    # that do not ("ncont"), alongside the global ClueWeb12-B13 statistics:
    # df(controversy, t), df(t), df_clueweb(controversy), df_clueweb(t).

    tf_cont = Counter()   # term frequency within controversy docs
    tf_ncont = Counter()  # term frequency within non-controversy docs
    ctf_cont = 0          # total token count of controversy docs
    ctf_ncont = 0         # total token count of non-controversy docs
    df_cont = Counter()   # document frequency within controversy docs
    df_ncont = Counter()  # document frequency within non-controversy docs
    cdf_cont = 0          # number of controversy docs
    cdf_ncont = 0         # number of non-controversy docs
    clueweb_tf, clueweb_df = load_clueweb12_B13_termstat()
    clueweb_ctf = sum(clueweb_tf.values())
    # No exact collection size is available here, so over-approximate it from
    # the largest observed document frequency.
    clueweb_cdf = max(clueweb_df.values()) + 100

    def get_tf(doc, t):
        return doc['tf_d'][t]

    def contain_controversy(doc):
        tokens = doc['tokens_set']
        return 'controversy' in tokens or 'controversial' in tokens

    def contain(doc, t):
        return t in doc['tokens_set']

    for doc in doc_list:
        current_doc_contain_controversy = contain_controversy(doc)
        for t in unigrams:
            if contain(doc, t):
                if current_doc_contain_controversy:
                    tf_cont[t] += get_tf(doc, t)
                    df_cont[t] += 1
                else:
                    tf_ncont[t] += get_tf(doc, t)
                    df_ncont[t] += 1

        if current_doc_contain_controversy:
            ctf_cont += doc['dl']
            cdf_cont += 1
        else:
            ctf_ncont += doc['dl']
            cdf_ncont += 1
    return cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf, clueweb_df, clueweb_tf, ctf_cont, ctf_ncont, df_cont, df_ncont, tf_cont, tf_ncont
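
count_term_stat assumes each document is a dict carrying a term-frequency map ('tf_d'), a token set for membership tests ('tokens_set'), and a document length ('dl'). A toy sketch of that shape, with made-up values:

from collections import Counter

# Hypothetical two-document input: the first mentions 'controversy' and feeds
# the *_cont counters, the second feeds the *_ncont counters.
toy_doc_list = [
    {'tf_d': Counter({'vaccine': 3, 'controversy': 1}),
     'tokens_set': {'vaccine', 'controversy'},
     'dl': 4},
    {'tf_d': Counter({'vaccine': 1, 'weather': 2}),
     'tokens_set': {'vaccine', 'weather'},
     'dl': 3},
]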
Example #5
def build_binary_feature(ci: DynRankedListInterface,
                         datapoint_list: List[PerspectiveCandidate]
                         ) -> List[Dict]:
    not_found_set = set()
    print("Load term stat")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    cdf = 50 * 1000 * 1000  # assumed ClueWeb12-B13 collection size, roughly 50M documents

    def idf_scorer(doc: Counter, claim_text: str, perspective_text: str) -> bool:
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)
        mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
        mentioned_terms = re_tokenize(mentioned_terms)

        def idf(term: str):
            if term not in clue12_13_df:
                # Terms that are substrings of string.printable (e.g. single
                # punctuation or alphanumeric characters) get zero weight;
                # other unseen terms fall through and receive the maximum idf,
                # since the Counter lookup below yields df = 0.
                if term in string.printable:
                    return 0
                not_found_set.add(term)

            return math.log((cdf+0.5)/(clue12_13_df[term]+0.5))

        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        return score > max_score * 0.8

    def data_point_to_feature(x: PerspectiveCandidate) -> Dict:
        e = get_feature_binary_model(x.cid, x.pid, x.claim_text, x.p_text,
                                     ci, idf_scorer)
        feature: Counter = e[0]
        num_mention: int = e[1]
        return {
            'feature': feature,
            'cid': x.cid,
            'pid': x.pid,
            'num_mention': num_mention,
            'label': x.label
        }

    r = lmap(data_point_to_feature, datapoint_list)
    return r
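
To make the 0.8 threshold in idf_scorer concrete: a document passes when the claim/perspective terms it mentions carry at least 80% of the total idf mass. A self-contained sketch with made-up document frequencies:

import math
from collections import Counter

cdf = 50 * 1000 * 1000              # same assumed collection size as above
df = Counter({'vaccines': 120_000,  # made-up document frequencies
              'mandatory': 300_000,
              'be': 30_000_000})

def idf(term):
    return math.log((cdf + 0.5) / (df[term] + 0.5))

cp_tokens = {'vaccines', 'mandatory', 'be'}
mentioned = {'vaccines', 'mandatory'}  # terms found in the document
score = sum(idf(t) for t in mentioned)
max_score = sum(idf(t) for t in cp_tokens)
print(score > max_score * 0.8)  # True: the two rare terms dominate the idf mass

Low-idf terms like 'be' barely move the threshold, so matching only the rare terms is enough.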
Example #6
def binary_feature_demo(datapoint_list):
    ci = PassageRankedListInterface(make_passage_query, Q_CONFIG_ID_BM25)
    not_found_set = set()
    _, clue12_13_df = load_clueweb12_B13_termstat()
    cdf = 50 * 1000 * 1000  # assumed ClueWeb12-B13 collection size, roughly 50M documents
    html = HtmlVisualizer("pc_binary_feature.html")

    def idf_scorer(doc, claim_text, perspective_text):
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(
            perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)
        mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
        mentioned_terms = re_tokenize(mentioned_terms)

        def idf(term):
            if term not in clue12_13_df:
                if term in string.printable:
                    return 0
                not_found_set.add(term)

            return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        return score, max_score, mentioned_terms

    def bm25_estimator(doc: Counter, claim_text: str, perspective_text: str):
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(
            perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        k1 = 0  # with k1 = 0 the tf component below reduces to f / (K + f)

        def BM25_3(f, qf, df, N, dl, avdl) -> float:
            K = compute_K(dl, avdl)
            first = math.log((N - df + 0.5) / (df + 0.5))
            second = ((k1 + 1) * f) / (K + f)
            return first * second

        dl = sum(doc.values())
        info = []
        for q_term in set(cp_tokens):
            if q_term in doc:
                score = BM25_3(doc[q_term], 0, clue12_13_df[q_term], cdf, dl,
                               1200)
                info.append((q_term, doc[q_term], clue12_13_df[q_term], score))
        return info

    for dp_idx, x in enumerate(datapoint_list):
        ranked_list: List[GalagoRankEntry] = ci.query_passage(
            x.cid, x.pid, x.claim_text, x.p_text)
        html.write_paragraph(x.claim_text)
        html.write_paragraph(x.p_text)
        html.write_paragraph("{}".format(x.label))

        local_print_cnt = 0
        lines = []
        for ranked_entry in ranked_list:
            try:
                doc_id = ranked_entry.doc_id
                galago_score = ranked_entry.score

                tokens = load_doc(doc_id)
                if tokens is not None:
                    doc_tf = Counter(tokens)
                    score, max_score, mentioned_terms = idf_scorer(
                        doc_tf, x.claim_text, x.p_text)
                    matched = score > max_score * 0.75
                else:
                    # No tokens recovered for this doc_id: record placeholder
                    # values so the row below can still be written.
                    doc_tf = Counter()
                    mentioned_terms = set()
                    matched = "Unk"
                    score = "Unk"
                    max_score = "Unk"

                def get_cell(token):
                    if token in mentioned_terms:
                        return Cell(token, highlight_score=50)
                    else:
                        return Cell(token)

                line = [doc_id, galago_score, matched, score, max_score]
                lines.append(line)
                html.write_paragraph("{0} / {1:.2f}".format(
                    doc_id, galago_score))
                html.write_paragraph("{}/{}".format(score, max_score))
                bm25_info = bm25_estimator(doc_tf, x.claim_text, x.p_text)
                bm25_score = sum(lmap(lambda e: e[3], bm25_info))
                html.write_paragraph(
                    "bm25 re-estimate : {}".format(bm25_score))
                html.write_paragraph("{}".format(bm25_info))
                html.multirow_print(lmap(get_cell, tokens))
                local_print_cnt += 1
                if local_print_cnt > 10:
                    break
            except KeyError:
                pass

        matched_idx = idx_where(lambda line: line[2], lines)
        if not matched_idx:
            html.write_paragraph("No match")
        else:
            last_matched = matched_idx[-1]
            lines = lines[:last_matched + 1]
            rows = lmap(lambda line: lmap(Cell, line), lines)
            html.write_table(rows)

        if dp_idx > 10:
            break
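
For reference, a standalone sketch of the per-term weight BM25_3 computes. compute_K is not shown in these examples, so the standard form K = k1_inner * ((1 - b) + b * dl / avdl) is assumed here, with hypothetical constants:

import math

def compute_K(dl, avdl, k1_inner=1.2, b=0.75):
    # Assumed document-length normalization; the project's compute_K may differ.
    return k1_inner * ((1 - b) + b * dl / avdl)

def bm25_term_weight(f, df, N, dl, avdl, k1=0):
    K = compute_K(dl, avdl)
    first = math.log((N - df + 0.5) / (df + 0.5))  # idf component
    second = ((k1 + 1) * f) / (K + f)              # saturated tf component
    return first * second

# With k1 = 0, as in binary_feature_demo, each occurrence beyond the first adds
# diminishing weight on top of the term's idf.
print(bm25_term_weight(f=3, df=120_000, N=50 * 1000 * 1000, dl=900, avdl=1200))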