Example #1
def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    with open(word_list_path, "r") as f:
        word_list_d: Dict = json.load(f)

    tokenizer = PCTokenizer()

    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = [t for t in base_query_terms
                            if t not in stopwords]

        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)

        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
Example #2
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)

        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob

        for t, score in bias_plus_word.items():
            score_for_cid[t] += score

        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid

    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
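
For context, normalize_counter_to_sum1 is assumed here to rescale a Counter so its values sum to 1, turning the per-term scores into a probability distribution. A minimal sketch of that assumed behavior (the _sketch name and the details are illustrative, not the project's implementation):

from collections import Counter

def normalize_counter_to_sum1_sketch(c: Counter) -> Counter:
    # Hypothetical stand-in: divide each value by the total.
    total = sum(c.values())
    if total == 0:
        return Counter()
    return Counter({k: v / total for k, v in c.items()})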
Example #3
def get_valid_terms():
    perspective = get_perspective_dict()
    tokenizer = PCTokenizer()
    voca = set()
    for text in perspective.values():
        voca.update(tokenizer.tokenize_stem(text))
    return voca
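
Nearly every example on this page relies on the same two PCTokenizer entry points: tokenize_stem(text), which returns a list of stemmed tokens, and tokenizer.stemmer.stem(token) for a single token. A minimal illustration of that assumed interface (outputs are illustrative and depend on the underlying stemmer):

tokenizer = PCTokenizer()
tokens = tokenizer.tokenize_stem("Universities should abolish tuition fees")
# e.g. ['univers', 'should', 'abolish', 'tuition', 'fee'] with a Porter-style stemmer
stemmed = tokenizer.stemmer.stem("running")  # e.g. 'run'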
Example #4
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)
Example #5
def build_lm(split) -> Iterable[RelevanceModel]:
    tokenizer = PCTokenizer()
    problems, candidate_pool_d = prepare_eval_data(split)
    payload: List[Passage] = get_eval_payload_from_dp(problems)
    for query, problem in zip(payload, problems):
        source_text = problem.text1.text
        tokens = tokenizer.tokenize_stem(source_text)
        counter = tokens_to_freq(tokens)
        yield RelevanceModel(query.id.id, query.text, counter)
Example #6
def count_df(passages: Iterable[Passage]) -> Counter:
    tokenizer = PCTokenizer()
    df = Counter()
    for p in passages:
        tokens = tokenizer.tokenize_stem(p.text)

        for term in set(tokens):
            df[term] += 1

    return df
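
A document-frequency Counter like this one typically feeds an IDF weight downstream (compare term_idf_factor in Example #13 and the pc_df pickle in Example #27). As a hedged illustration only, a plain log-IDF that may well differ from BM25Bare's actual formula:

import math
from collections import Counter

def idf_sketch(df: Counter, num_doc: int, term: str) -> float:
    # Plain log-IDF; the +1 smooths terms never seen in the collection.
    return math.log(num_doc / (1 + df[term]))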
Example #7
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Example #8
    def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
        self.query_lms = query_lms
        bg_lm = average_counters(list(query_lms.values()))
        self.bg_lm = bg_lm
        self.log_bg_lm: Counter = get_lm_log(bg_lm)
        self.alpha = alpha
        self.log_odd_d: Dict[str, Counter] = {
            k: Counter()
            for k in query_lms.keys()
        }
        self.stopwords = load_stopwords_for_query()
        self.tokenizer = PCTokenizer()
Example #9
    def __init__(self, out_dir):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        tprint("Loading doc ids")
        self.doc_ids = all_doc_ids_of_interest()
        tprint("Loading robust docs")
        self.docs: Dict[str, str] = trec.load_robust(robust_path)
        tprint("Start processing")

        n_docs = len(self.doc_ids)
        # n_jobs is presumably a module-level constant in the original code.
        docs_per_job = int((n_docs + n_jobs) / 5)
        self.docs_per_job = docs_per_job
        self.tokenizer = PCTokenizer()
        self.out_dir = out_dir
Example #10
class TokenizeForBM25Worker:
    def __init__(self, split, query_group, candidate_docs_d, out_dir):
        self.query_group = query_group
        self.tokenizer = PCTokenizer()
        self.candidate_docs_d = candidate_docs_d
        self.out_dir = out_dir
        self.ms_reader = MSMarcoDataReader(split)

    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        def get_tf(text):
            tokens = self.tokenizer.tokenize_stem(text)
            return Counter(tokens)

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    title_tokens = self.tokenizer.tokenize_stem(d.title)
                    body_sents = sent_tokenize(d.body)
                    body_tf_list = lmap(get_tf, body_sents)
                    tokens_d[d.doc_id] = (title_tokens, body_tf_list)

            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid)
                print("{} of {} not found".format(len(tokens_d),
                                                  len(target_docs)))

            save_path = os.path.join(self.out_dir, str(qid))
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)
Example #11
def get_eval_candidates(split, top_k=50) -> List[Tuple[int, List[Dict]]]:
    # split -> claims
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    tokenizer = PCTokenizer()

    def get_candidates(c: Dict) -> Tuple[int, List[Dict]]:
        cid = c["cId"]
        assert isinstance(cid, int)
        claim_text = c["text"]
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list = []
        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            rationale = "es_rank={} , es_score={}".format(rank, _score)
            p_entry = {
                'cid': cid,
                'pid': _pid,
                'claim_text': claim_text,
                'perspective_text': _text,
                'p_tokens': tokenizer.tokenize_stem(_text),
                'rationale': rationale,
            }
            candidate_list.append(p_entry)
        return cid, candidate_list

    candidates: List[Tuple[int, List[Dict]]] = lmap(get_candidates, claims)
    return candidates
Example #12
class Worker:
    def __init__(self, out_dir):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        tprint("Loading doc ids")
        self.doc_ids = all_doc_ids_of_interest()
        tprint("Loading robust docs")
        self.docs: Dict[str, str] = trec.load_robust(robust_path)
        tprint("Start processing")

        n_docs = len(self.doc_ids)
        # n_jobs is presumably a module-level constant in the original code.
        docs_per_job = int((n_docs + n_jobs) / 5)
        self.docs_per_job = docs_per_job
        self.tokenizer = PCTokenizer()
        self.out_dir = out_dir

    def work(self, job_id):
        doc_id_to_count = dict()
        st = job_id * self.docs_per_job
        ed = st + self.docs_per_job
        todo = self.doc_ids[st:ed]
        ticker = TimeEstimator(len(todo))
        for doc_id in todo:
            try:
                text = self.docs[doc_id]
                tokens = self.tokenizer.tokenize_stem(text)
                counter = Counter(tokens)
                doc_id_to_count[doc_id] = counter
                ticker.tick()
            except KeyError as e:
                print(e)
                print("key error")

        save_path = os.path.join(self.out_dir, str(job_id))
        with open(save_path, "wb") as f:
            pickle.dump(doc_id_to_count, f)
Example #13
class BM25:
    def __init__(self, df, num_doc, avdl, k1=0.01, k2=100, b=0.6):
        self.core = BM25Bare(df, num_doc, avdl, k1, k2, b)
        self.tokenizer = PCTokenizer()

    def score(self, query, text) -> NamedNumber:
        q_terms = self.tokenizer.tokenize_stem(query)
        t_terms = self.tokenizer.tokenize_stem(text)
        q_tf = Counter(q_terms)
        t_tf = Counter(t_terms)
        return self.core.score_inner(q_tf, t_tf)

    def term_idf_factor(self, term):
        return self.core.term_idf_factor(term)

    def score_inner(self, q_tf, t_tf) -> NamedNumber:
        return self.core.score_inner(q_tf, t_tf)
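
Assuming the df Counter pickled as "pc_df" in Example #27, usage would look roughly like the sketch below; num_doc and avdl are placeholder statistics, not values taken from the project:

df = load_from_pickle("pc_df")
bm25 = BM25(df, num_doc=541, avdl=11.7)  # placeholder corpus statistics
score = bm25.score("higher education", "tuition-free universities benefit society")
print(float(score), score.name)  # NamedNumber is assumed to carry an explanation string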
Example #14
    def __init__(self, bm25_module, max_seq_length, include_title=False):
        self.max_seq_length = max_seq_length
        self.bm25_module = bm25_module
        pc_tokenize = PCTokenizer()
        self.tokenize_stem = pc_tokenize.tokenize_stem
        self.include_title = include_title
        bert_tokenizer = get_tokenizer()
        self.bert_tokenize = bert_tokenizer.tokenize
Example #15
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
Example #16
def build_baseline_lms(claims):
    tokenizer = PCTokenizer()

    def get_claim_lm(claim):
        cid = claim["cId"]
        counter = tokens_to_freq(tokenizer.tokenize_stem(claim['text']))
        return ClaimLM(cid, claim['text'], counter)

    claim_lms = lmap(get_claim_lm, claims)
    return claim_lms
Example #17
class LMScorer:
    def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
        self.query_lms = query_lms
        bg_lm = average_counters(list(query_lms.values()))
        self.bg_lm = bg_lm
        self.log_bg_lm: Counter = get_lm_log(bg_lm)
        self.alpha = alpha
        self.log_odd_d: Dict[str, Counter] = {
            k: Counter()
            for k in query_lms.keys()
        }
        self.stopwords = load_stopwords_for_query()
        self.tokenizer = PCTokenizer()

    def score(self, query_id, raw_tokens) -> float:
        stemmed_tokens = self.filter_and_stem(raw_tokens)
        return self._get_score_from_stemmed_tokens(query_id, stemmed_tokens)

    def filter_and_stem(self, tokens):
        stemmed_tokens = []
        for t in tokens:
            if t in self.stopwords:
                continue
            try:
                stemmed_tokens.append(self.tokenizer.stemmer.stem(t))
            except UnicodeDecodeError:
                pass
        return stemmed_tokens

    def score_text(self, query_id, text):
        tokens = self.tokenizer.tokenize_stem(text)
        tokens = [t for t in tokens if t not in self.stopwords]
        return self._get_score_from_stemmed_tokens(query_id, tokens)

    def _get_score_from_stemmed_tokens(self, query_id, tokens) -> float:
        log_odd_d: Counter = self.log_odd_d[query_id]
        lm = self.query_lms[query_id]

        def get_score(token: str) -> float:
            if token in log_odd_d:
                return log_odd_d[token]

            if token in lm or token in self.bg_lm:
                prob_pos = lm[token] * (
                    1 - self.alpha) + self.bg_lm[token] * self.alpha
                pos_log = math.log(prob_pos)
            else:
                pos_log = 0
            score = pos_log - self.log_bg_lm[token]
            log_odd_d[token] = score
            return score

        return average(lmap(get_score, tokens))
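
The per-token quantity cached in log_odd_d is a smoothed log-odds score, log((1 - alpha) * P_lm(t) + alpha * P_bg(t)) - log P_bg(t), and score averages it over a document's tokens. A minimal usage sketch with made-up query language models:

from collections import Counter

lms = {
    "q1": Counter({"tax": 0.6, "increas": 0.4}),
    "q2": Counter({"school": 0.7, "fund": 0.3}),
}
scorer = LMScorer(lms, alpha=0.5)
print(scorer.score_text("q1", "the proposed tax increase"))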
Example #18
def get_extended_eval_candidate(split) -> Dict[int, List[int]]:
    bm25 = get_bm25_module()
    d_ids = load_claim_ids_for_split(split)
    claims: List[Dict] = get_claims_from_ids(d_ids)
    cid_to_pids: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    tokenizer = PCTokenizer()

    def get_tf_idf(c: Counter):
        r = Counter()
        for t, cnt in c.items():
            tfidf = bm25.term_idf_factor(t) * cnt
            r[t] = tfidf
        return r

    def get_candidates(c: Dict) -> Tuple[int, List[int]]:
        cid = c["cId"]
        assert isinstance(cid, int)
        claim_text = c["text"]
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        top_k = 50
        lucene_results = es_helper.get_perspective_from_pool(claim_text, top_k)
        candidate_list: List[int] = []

        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            candidate_list.append(_pid)

        gold_pids = cid_to_pids[int(cid)]
        hard_candidate = []
        mismatch_voca = Counter()
        for pid in gold_pids:
            if pid not in candidate_list:
                hard_candidate.append(pid)
                p_text = perspective_getter(pid)
                p_tokens = tokenizer.tokenize_stem(p_text)

                for t in p_tokens:
                    if t not in claim_tokens:
                        mismatch_voca[t] += 1

        candidate_list.extend(hard_candidate)
        mismatch_tf_idf = get_tf_idf(mismatch_voca)
        new_qterms = left(mismatch_tf_idf.most_common(30))
        lucene_results = es_helper.get_perspective_from_pool(
            " ".join(new_qterms), top_k)

        for rank, (_text, _pid, _score) in enumerate(lucene_results):
            if _pid not in candidate_list:
                candidate_list.append(_pid)

        return cid, candidate_list

    candidates: List[Tuple[int, List[int]]] = lmap(get_candidates, claims)
    return dict(candidates)
Example #19
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert isinstance(tokens[0], str)
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with:")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Example #20
def run_lm2():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    tokenizer = PCTokenizer()
    tf_d = {
        c['cId']: Counter(nltk.tokenize.word_tokenize(c['text']))
        for c in claims
    }
    bm25 = get_bm25_module()
    ctf = get_perspective_tf()
    pred = predict_by_lm(tf_d, ctf, bm25, claims, top_k)
    print(evaluate(pred))
Example #21
def predict_by_bm25_rm(bm25_module: BM25,
                       rm_info: Dict[str, List[Tuple[str, str]]],
                       claims, top_k) -> List[Tuple[str, List[Dict]]]:

    cid_to_text: Dict[int, str] = claims_to_dict(claims)
    tokenizer = PCTokenizer()

    def stem_merge(score_list: List[Tuple[str, float]]) -> Counter:
        c = Counter()
        for k, v in score_list:
            try:
                new_k = tokenizer.stemmer.stem(k)
                c[new_k] += v
            except UnicodeDecodeError:
                pass
        return c

    rm_info: Dict[str, List[Tuple[str, float]]] = dict_value_map(
        parse_float, rm_info)
    rm_info = dict_value_map(normalize_scores, rm_info)
    rm_info_c: Dict[str, Counter] = dict_value_map(stem_merge, rm_info)
    print(len(rm_info_c.keys()))
    print(len(claims))
    not_found = set()

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        c_text = cid_to_text[int(claim_id)]
        p_text = perspective_getter(int(p_id))
        score: NamedNumber = bm25_module.score(c_text, p_text)

        # claim_id stays a str: rm_info and rm_info_c are keyed by str.
        if claim_id in rm_info_c:
            ex_qtf = rm_info_c[claim_id]
            p_tokens = tokenizer.tokenize_stem(p_text)
            ex_score = bm25_module.score_inner(ex_qtf, Counter(p_tokens))
            new_info = score.name + "({})".format(ex_score.name)
            score = NamedNumber(score + ex_score, new_info)
        else:
            not_found.add(claim_id)
        return score

    r = predict_interface(claims, top_k, scorer)
    print(not_found)
    return r
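
NamedNumber, used here and in Example #13, appears to be a float-like score that carries a human-readable explanation (note score.name above and the NamedNumber(value, name) construction). A minimal sketch of that assumed contract:

class NamedNumberSketch(float):
    # Hypothetical stand-in: a float that remembers why it has this value.
    def __new__(cls, value, name=""):
        obj = super().__new__(cls, value)
        obj.name = name
        return obj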
Example #22
def get_query_lms(split) -> Dict[str, Counter]:
    evi_dict: Dict[int, str] = load_evidence_dict()
    tokenizer = PCTokenizer()
    queries = get_qck_queries(split)
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    def get_evidence_texts(query: QCKQuery) -> List[str]:
        query_id = query.query_id
        e_ids: List[int] = evi_gold_dict[query_id]
        return list([evi_dict[eid] for eid in e_ids])

    def get_query_lm(query: QCKQuery) -> Counter:
        return text_list_to_lm(tokenizer, get_evidence_texts(query))

    lms = lmap(get_query_lm, queries)
    qids = lmap(QCKQuery.get_id, queries)
    query_lms: Dict[str, Counter] = dict(zip(qids, lms))
    return query_lms
Example #23
    def __init__(self, split, query_group, candidate_docs_d, max_sent_length,
                 max_title_length, out_dir):
        self.query_group = query_group
        self.candidate_docs_d = candidate_docs_d
        self.out_dir = out_dir
        self.bert_tokenizer = get_tokenizer()
        self.stem_tokenizer = PCTokenizer()
        self.max_sent_length = max_sent_length
        self.max_title_length = max_title_length
        self.ms_reader = MSMarcoDataReader(split)
        self.text_dir_name = 'text'
        self.bert_tokens_dir_name = 'bert_tokens'
        self.stemmed_tokens_dir_name = 'stemmed_tokens'

        for name in [
                self.text_dir_name, self.bert_tokens_dir_name,
                self.stemmed_tokens_dir_name
        ]:
            exist_or_mkdir(os.path.join(self.out_dir, name))
Example #24
def build_gold_lms(claims) -> List[ClaimLM]:
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()

    def get_cluster_lm(cluster: List[int]) -> Counter:
        p_text_list: List[str] = lmap(perspective_getter, cluster)
        tokens_list: List[List[str]] = lmap(tokenizer.tokenize_stem,
                                            p_text_list)
        counter_list = lmap(tokens_to_freq, tokens_list)
        counter = average_counters(counter_list)
        return counter

    def get_claim_lm(claim) -> ClaimLM:
        cid = claim["cId"]
        counter_list: List[Counter] = lmap(get_cluster_lm, gold[cid])
        counter: Counter = average_counters(counter_list)
        return ClaimLM(cid, claim['text'], counter)

    claim_lms = lmap(get_claim_lm, claims)
    return claim_lms
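
Note the two-level averaging: each gold cluster of perspectives is first collapsed into one LM, and the cluster LMs are then averaged, so large clusters do not dominate the claim LM. average_counters is assumed to behave roughly like this key-wise mean (a sketch, not the project's implementation):

from collections import Counter
from typing import List

def average_counters_sketch(counters: List[Counter]) -> Counter:
    # Key-wise mean; terms missing from a counter contribute 0.
    out = Counter()
    for c in counters:
        for k, v in c.items():
            out[k] += v / len(counters)
    return out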
Example #25
def get_train_passage_a_lms():
    data = load_from_pickle("pc_train_a_passages")
    entries, all_passages = data
    voca = get_valid_terms()

    tokenizer = PCTokenizer()
    bg_tf = tokens_to_freq(flatten(left(all_passages)))
    bg_tf = simplify_tf(bg_tf, voca)
    alpha = 0.99 # Smoothing with claim
    alpha2 = 0.3 # Smoothing with collection documents
    r = []
    ticker = TimeEstimator(len(entries))
    for c, passages in entries:
        r_tf = passage_to_lm(tokenizer, c, passages, alpha)
        r_tf = simplify_tf(r_tf, voca)
        c_tf = smooth_ex(r_tf, bg_tf, alpha2)
        r.append(ClaimLM(c['cId'], c['text'], c_tf))
        ticker.tick()

    return r
Example #26
def predict_by_lm(claim_lms: List[ClaimLM],
                  claims,
                  top_k) -> List[Tuple[str, List[Dict]]]:

    alpha = 0.1
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    tokenizer = PCTokenizer()
    print("Eval log odds")
    claim_log_odds_dict = {str(c_lm.cid): get_log_odd(c_lm, bg_lm, alpha)
                           for c_lm in claim_lms}

    def scorer(lucene_score, query_id) -> NamedNumber:
        claim_id, p_id = query_id.split("_")
        p_text = perspective_getter(int(p_id))
        tokens = tokenizer.tokenize_stem(p_text)
        c_lm = claim_log_odds_dict[claim_id]
        reason = " ".join(["{0} ({1:.2f})".format(t, c_lm[t]) for t in tokens])
        score = sum([c_lm[t] for t in tokens])
        return NamedNumber(score, reason)

    r = predict_interface(claims, top_k, scorer)
    return r
Example #27
def build_df():
    claims, val = train_split()
    gold = get_claim_perspective_id_dict()

    tokenizer = PCTokenizer()
    df = Counter()

    dl_list = []
    for claim in claims:
        cid = claim["cId"]
        gold_pids = flatten(gold[cid])
        p_text_list: List[str] = lmap(perspective_getter, gold_pids)
        tokens_list = lmap(tokenizer.tokenize_stem, p_text_list)
        dl_list.extend(lmap(len, tokens_list))

        for t in set(flatten(tokens_list)):
            df[t] += 1

    print(dl_list)
    print("Avdl", average(dl_list))
    print(len(claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
Example #28
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)

    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)

    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)

    def get_rel_doc_id(qid):
        if qid not in resource.get_doc_for_query_d():
            raise KeyError
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError

    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenize = PCTokenizer()
    bert_tokenizer = get_tokenizer()

    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = [passage_id for passage_id, score
                            in passage_qrels[qid].items() if score]
            success = False
            found_idx = -1
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenize.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]

                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]

                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx
            if not success:
                # passage_tokens here is from the last relevant passage checked.
                print("Not found. doc_lines={} passage_len={}".format(
                    len(stemmed_body_tokens_list), len(passage_tokens)))

        except KeyError:
            pass
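
The localization step above hinges on lcs returning both a match count n and, when its third argument is truthy, a list of aligned index pairs, so that log[0] gives the match's position in the flattened body tokens. A rough sketch of that assumed contract (the project's lcs may differ):

def lcs_sketch(a, b, with_log=False):
    # Classic O(len(a) * len(b)) longest-common-subsequence DP;
    # dp[i][j] is the LCS length of a[i:] and b[j:].
    n_a, n_b = len(a), len(b)
    dp = [[0] * (n_b + 1) for _ in range(n_a + 1)]
    for i in range(n_a - 1, -1, -1):
        for j in range(n_b - 1, -1, -1):
            if a[i] == b[j]:
                dp[i][j] = dp[i + 1][j + 1] + 1
            else:
                dp[i][j] = max(dp[i + 1][j], dp[i][j + 1])
    if not with_log:
        return dp[0][0]
    # Recover one alignment as (index_in_a, index_in_b) pairs.
    log, i, j = [], 0, 0
    while i < n_a and j < n_b:
        if a[i] == b[j]:
            log.append((i, j))
            i, j = i + 1, j + 1
        elif dp[i + 1][j] >= dp[i][j + 1]:
            i += 1
        else:
            j += 1
    return dp[0][0], log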
Example #29
    def __init__(self, df, num_doc, avdl, k1=0.01, k2=100, b=0.6):
        self.core = BM25Bare(df, num_doc, avdl, k1, k2, b)
        self.tokenizer = PCTokenizer()
Example #30
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")

    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)]
                        for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))

        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])

        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " +
                                        score_line(random5_scores))

        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)

        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))