Пример #1
0
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)
Пример #2
0
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)
                pass

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
Пример #3
0
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Пример #4
0
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)



    print("lm scores correlation with ")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Пример #5
0
    def show(claim_lm: ClaimLM):
        print('----')
        print(claim_lm.claim)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        for k, v in claim_lm.LM.most_common(50):
            print(k, v)

        s = "\t".join(left(claim_lm.LM.most_common(10)))
        print("LM freq: ", s)
        print(s)

        s = "\t".join(left(log_odd.most_common(30)))
        print("Log odd top", s)

        s = "\t".join(left(least_common(log_odd, 10)))
        print("Log odd bottom", s)
Пример #6
0
    def show(r: RelevanceModel):
        print('----')
        print(r.text)
        log_topic_lm = get_lm_log(smooth(r.lm, bg_lm, alpha))
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        for k, v in r.lm.most_common(50):
            print(k, v)

        s = "\t".join(left(r.lm.most_common(10)))
        print("LM freq: ", s)
        print(s)

        s = "\t".join(left(log_odd.most_common(30)))
        print("Log odd top", s)

        s = "\t".join(left(least_common(log_odd, 10)))
        print("Log odd bottom", s)
Пример #7
0
 def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
     self.query_lms = query_lms
     bg_lm = average_counters(list(query_lms.values()))
     self.bg_lm = bg_lm
     self.log_bg_lm: Counter = get_lm_log(bg_lm)
     self.alpha = alpha
     self.log_odd_d: Dict[str, Counter] = {
         k: Counter()
         for k in query_lms.keys()
     }
     self.stopwords = load_stopwords_for_query()
     self.tokenizer = PCTokenizer()
Пример #8
0
def get_generator(max_seq_length, bg_lm, alpha):
    log_bg_lm = get_lm_log(bg_lm)
    top_n = 100
    stopwords = load_stopwords_for_query()
    fail_logger = Counter()
    bert_tokenizer = get_tokenizer()

    def generate(claim_lm: ClaimLM, ranked_list: List[SimpleRankedListEntry]):
        claim_text = claim_lm.claim
        claim_tokens = bert_tokenizer.tokenize(claim_text)
        claim_token_len = len(claim_tokens)

        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        doc_ids = lmap(lambda x: x.doc_id, ranked_list[:top_n])
        print("loading docs")
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

        window_size = max_seq_length - claim_token_len - 3
        step_size = max_seq_length - 112
        enum_paragraph = enum_paragraph_functor(step_size, window_size)

        def get_record(tokens):
            scores, masks = get_target_labels(tokens, log_odd, stopwords,
                                              fail_logger)
            sum(scores)
            return Record(claim_tokens, tokens, scores, masks)

        tokens_list: List[List[str]] = []
        not_found = 0
        for doc_id in doc_ids:
            try:
                tokens: List[str] = list(
                    flatten(load(BertTokenizedCluewebDoc, doc_id)))
                tokens_list.append(tokens)
            except KeyError:
                not_found += 1
                pass

        print("{} of {} not found".format(not_found, len(tokens_list)))
        paragraph_list: Iterable[List[str]] = enum_paragraph(tokens_list)
        records: List[Record] = lmap(get_record, paragraph_list)

        return records

    return generate
Пример #9
0
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")

    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)]
                        for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))

        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])

        if len(random_passages) > 5:
            random_sel_pssages = random.choices(random_passages, k=5)
        else:
            random_sel_pssages = []
        random5_scores = lmap(get_passage_score, random_sel_pssages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " +
                                        score_line(random5_scores))

        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)

        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))
Пример #10
0
def join_docs_and_lm():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords.update([".", ",", "!", "?"])

    alpha = 0.1

    html_visualizer = HtmlVisualizer("doc_lm_joined.html")

    def get_cell_from_token2(token, probs):
        if token.lower() in stopwords:
            probs = 0
        probs = probs * 1e5
        s = min(100, probs)
        c = Cell(token, s)
        return c

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))

        clusters: List[List[int]] = gold[c['cId']]

        for cluster in clusters:
            html_visualizer.write_paragraph("---")
            p_text_list: List[str] = lmap(perspective_getter, cluster)
            for text in p_text_list:
                html_visualizer.write_paragraph(text)
            html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)),
                             doc)
                html_visualizer.write_headline("Doc rank {}".format(i))
                html_visualizer.multirow_print(cells, width=20)
            except KeyError:
                pass
        html_visualizer.write_paragraph("Not found: {}".format(not_found))