Example #1
def enum_true_instance(sel_per_review=0) -> Iterable[Tuple[Claim, Claim]]:
    reviews: List[Review] = load_parsed()

    def rank_fn(e: Tuple[Claim, Claim]):
        claim1, claim2 = e
        return num_common_terms(claim1.text, claim2.text)

    for review in reviews:
        pair_per_review = []
        yes_claim_list = lfilter(lambda c: c.assertion == "YS", review.claim_list)
        no_claim_list = lfilter(lambda c: c.assertion == "NO", review.claim_list)

        for yes_claim in yes_claim_list:
            for no_claim in no_claim_list:
                e = yes_claim, no_claim
                pair_per_review.append(e)

        pair_per_review.sort(key=rank_fn, reverse=True)

        if sel_per_review == 0:
            pairs = pair_per_review
        else:
            pairs = pair_per_review[:sel_per_review]

        for claim1, claim2 in pairs:
            yield claim1, claim2
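Note: the helpers lfilter, lfilter_not, lmap, left, and right that appear throughout these examples are project utilities that are not defined on this page. The sketch below shows their assumed behavior (eager, list-returning counterparts of filter and map, plus tuple projections) for reference only; the real implementations may differ.

from typing import Callable, Iterable, List, Tuple, TypeVar

T = TypeVar("T")
U = TypeVar("U")


def lfilter(f: Callable[[T], bool], xs: Iterable[T]) -> List[T]:
    # assumed: eager, list-returning filter
    return list(filter(f, xs))


def lfilter_not(f: Callable[[T], bool], xs: Iterable[T]) -> List[T]:
    # assumed: keep elements for which the predicate is false
    return [x for x in xs if not f(x)]


def lmap(f: Callable[[T], U], xs: Iterable[T]) -> List[U]:
    # assumed: eager, list-returning map
    return list(map(f, xs))


def left(pairs: Iterable[Tuple[T, U]]) -> List[T]:
    # assumed: first elements of (a, b) pairs
    return [a for a, _ in pairs]


def right(pairs: Iterable[Tuple[T, U]]) -> List[U]:
    # assumed: second elements of (a, b) pairs
    return [b for _, b in pairs]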
Example #2
def enum_true_instance() -> Iterable[Tuple[Claim, Claim, str]]:
    reviews: List[Review] = load_parsed()
    for review in reviews:
        yes_claim_list = lfilter(lambda c: c.assertion == "YS",
                                 review.claim_list)
        no_claim_list = lfilter(lambda c: c.assertion == "NO",
                                review.claim_list)

        for yes_claim in yes_claim_list:
            for no_claim in no_claim_list:
                yield yes_claim, no_claim, "Yes/No from a same review"
                yield no_claim, yes_claim, "No/Yes from a same review"
Example #3
    def __init__(self, split):
        super(ProcessedResourcePredict10, self).__init__(split)

        candidate_docs_d: Dict[QueryID, List[str]] = top100_doc_ids(split)
        new_candidate_docs_d: Dict[QueryID, List[str]] = {}
        for qid, doc_ids in candidate_docs_d.items():
            pos_doc_ids = lfilter(lambda doc_id: self.get_label(qid, doc_id), doc_ids)
            neg_doc_ids = lfilter(lambda doc_id: not self.get_label(qid, doc_id), doc_ids)
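            # keep all positive docs, then pad with shuffled negatives so each query keeps at most 10 candidates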
            n_neg = 10 - len(pos_doc_ids)
            random.shuffle(neg_doc_ids)
            doc_ids_selected = pos_doc_ids + neg_doc_ids[:n_neg]
            assert len(doc_ids_selected) <= 10
            new_candidate_docs_d[qid] = doc_ids_selected
        self.candidate_doc_d = new_candidate_docs_d
Example #4
def enum_neg_instance2() -> Iterable[Tuple[Claim, Claim, str]]:
    reviews: List[Review] = load_parsed()
    for review in reviews:
        yes_claim_list = lfilter(lambda c: c.assertion == "YS",
                                 review.claim_list)
        no_claim_list = lfilter(lambda c: c.assertion == "NO",
                                review.claim_list)

        for c1, c2 in combinations(yes_claim_list, 2):
            yield c1, c2, "{}/{} from a same review".format(
                c1.assertion, c2.assertion)

        for c1, c2 in combinations(no_claim_list, 2):
            yield c1, c2, "{}/{} from a same review".format(
                c1.assertion, c2.assertion)
Example #5
def combine_subjectivity_annotation(
        doc: MPQARawDoc, ann_list: List[MPQAAnnLine]) -> MPQADocSubjectiveInfo:
    def is_sentence_annot(ann: MPQAAnnLine) -> bool:
        return ann.ann_type == "GATE_sentence"

    # identify sentences
    sentences = lfilter(is_sentence_annot, ann_list)
    sentences.sort(key=lambda s: s.span[0])
    if not sentences:
        print(ann_list)
    assert sentences

    def is_it_about_subjective(ann: MPQAAnnLine) -> bool:
        return ann.ann_type in [EXPRESSIVE_SUBJECTIVITY, DIRECT_SUBJECTIVITY]

    # filter subjectivity related ones
    ann_about_subjectivity = lfilter(is_it_about_subjective, ann_list)

    def find_sentence(span) -> int:
        st, ed = span
        for s in sentences:
            st_s, ed_s = s.span
            if st_s <= st and ed <= ed_s:
                return s.id

        raise KeyError()

    # Match sentence with annotation
    global num_error

    s_list_to_ann_list: Dict[int, List] = defaultdict(list)
    for annot in ann_about_subjectivity:
        try:
            if annot.span == (0, 0):
                continue
            sentence_id = find_sentence(annot.span)
            s_list_to_ann_list[sentence_id].append(annot)
        except KeyError:
            num_error += 1

    annot_sent_list = []
    for raw_sent in sentences:
        ann_list = s_list_to_ann_list[raw_sent.id]
        tags = list([a.ann_type for a in ann_list])
        annot_sent = Sentence(raw_sent.id, raw_sent.span, tags, ann_list)
        annot_sent_list.append(annot_sent)

    return MPQADocSubjectiveInfo(doc.doc_id, doc.content, annot_sent_list)
Example #6
def filter_avail(claims):
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(
        FileName("resolute_dict_580_606"))
    cid_list: List[int] = lmap(lambda x: int(x.split("_")[0]),
                               cpid_resolute.values())
    cid_list: Set[int] = set(cid_list)
    return lfilter(lambda x: x['cId'] in cid_list, claims)
Example #7
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[Payload]:
        cid = claim['cId']
        claim = claim['text']
        perspectives = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]

        if self.filter_good:
            filter_condition = score_over_zero
        else:

            def filter_condition(dummy):
                return True

        good_passages: List[List[str]] = left(
            lfilter(filter_condition, passages))
        output = []
        for pid in perspectives:
            is_correct = any([pid in cluster for cluster in self.gold[cid]])
            for passage_idx, passage in enumerate(good_passages):
                perspective = perspective_getter(pid)
                info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
                p = Payload(passage, claim, perspective,
                            data_id_manager.assign(info), is_correct)
                output.append(p)

        return output
Example #8
    def generate_instances(self, claim: Dict,
                           data_id_manager: DataIDManager) -> List[Instance]:
        cid = claim['cId']
        claim = claim['text']

        passages = self.cid_to_passages[cid]
        good_passages: List[List[str]] = left(
            lfilter(score_over_zero, passages))
        not_good_passages: List[List[str]] = left(
            lfilter_not(score_over_zero, passages))

        n_good = len(good_passages)
        n_not_good = len(not_good_passages)
        random_passage = list([self.random_sample(cid) for _ in range(10)])

        # len(pair_list_g_ng) = n_not_good   ( assuming n_not_good > n_good)

        def make_instance(passage, label):
            info = {'cid': cid}
            return Instance(claim, passage, label,
                            data_id_manager.assign(info))

        l1 = lmap(lambda p: make_instance(p, 1), good_passages)
        l2 = lmap(lambda p: make_instance(p, 0), not_good_passages)
        l3 = lmap(lambda p: make_instance(p, 0), random_passage)
        print("g: ng : rand = {} : {} : {}".format(len(l1), len(l2), len(l3)))
        return l1 + l2 + l3
Example #9
def main():
    train_data = load_argu_data_from_pickle("training")
    averager = Averager()

    for text, label in train_data[:200]:
        print(label)
        raw_text = text.text
        text_list: List[str] = raw_text.split("\n\n")

        def is_not_empty_line(l):
            return l.strip()

        text_list = lfilter(is_not_empty_line, text_list)
        sentence_list = lflatten(lmap(sent_tokenize, text_list))

        def is_reference(l):
            if len(l) < 3:
                return False
            if l[0] == "[" and l[1] == "i":
                return True
            if l[0] == "[" and l[2] == "]":
                return True
            if "http://" in l:
                return True
            return False

        sentence_list = lfilter_not(is_reference, sentence_list)
        averager.append(len(sentence_list))
    print(averager.get_average())
Example #10
def work(st, ed):
    st = int(st)
    ed = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    ci = DynRankedListInterface(make_doc_query, q_config_id)
    all_data_points = load_train_data_point()

    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    num_request = 10000
    todo = all_data_points[st:ed]
    not_done = lfilter(partial(db_not_contains, q_config_id), todo)
    queries: List[DocQuery] = lmap(datapoint_to_doc_query, not_done)
    print("Executing {} queries".format(len(queries)))
    ranked_list_dict: Dict[str, List[SimpleRankedListEntry]] = \
        send_doc_queries(ci.disk_path, num_request, queries, 600)
    qid_list = lmap(dp_to_qid, not_done)

    print("{} of {} succeed".format(len(ranked_list_dict), len(queries)))

    def add_to_db(query_id: str):
        if query_id in ranked_list_dict:
            r = ranked_list_dict[query_id]
            q_res_id: str = "{}_{}".format(query_id, q_config_id)
            if not has_key(QueryResult, q_res_id):
                save(QueryResult, q_res_id, r)

    foreach(add_to_db, qid_list)
    flush()
Example #11
def collect_good_passages(data_id_to_info: Dict[int, Dict],
                          passage_score_path: FilePath,
                          config: Dict
                          ):
    recover_subtokens = get_recover_subtokens()

    score_cut = config['score_cut']
    top_k = config['top_k']
    grouped_scores: Dict[int, List[Dict]] = read_passage_scores(
        passage_score_path, data_id_to_info, recover_subtokens)

    def get_score_from_logit(logits):
        return scipy.special.softmax(logits)[1]

    def is_good(d: Dict):
        score = get_score_from_logit(d['logits'])
        return score >= score_cut

    output = []
    num_passges = []
    for cid, passages in grouped_scores.items():
        good_passages = lfilter(is_good, passages)
        good_passages.sort(key=lambda d: get_score_from_logit(d['logits']), reverse=True)
        num_passges.append(len(good_passages))
        if good_passages:
            output.append((cid, good_passages[:top_k]))
        else:
            scores = list([get_score_from_logit(d['logits']) for d in passages])
            scores.sort(reverse=True)

    print(num_passges)
    print("{} of {} claims has passages".format(len(output), len(grouped_scores)))
    return output
Example #12
def select_vertices_edges(counter) -> Tuple[Edges, List[Any]]:
    def is_not_funct(word):
        if len(word) > 2:
            return True

        return word not in ",.)(:'\"`-?''``,%"

    #print("total pairs", len(counter))
    vertice_counter = get_vertices_info(counter)
    #print("total terms", len(vertice_counter))
    common_vertices = list([(k, cnt) for k, cnt in vertice_counter.items()
                            if cnt > 100])
    common_vertices.sort(key=lambda x: x[1], reverse=True)
    # print(left(common_vertices[:20]))
    # print("Terms with more than 100 appearance : ", len(common_vertices))
    valid_vertices: List[Any] = lfilter(is_not_funct, left(common_vertices))
    valid_pairs = list([((a, b), cnt) for (a, b), cnt in counter.items()
                        if a in valid_vertices and b in valid_vertices])
    # print("valid pairs", len(valid_pairs))
    unnormalized_edges: Dict[Any, Dict] = {}
    for (a, b), cnt in valid_pairs:
        if a not in unnormalized_edges:
            unnormalized_edges[a] = Counter()
        unnormalized_edges[a][b] += cnt

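    # normalize each vertex's outgoing co-occurrence counts into transition probabilities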
    edges = {}
    for vertex_a, raw_edges in unnormalized_edges.items():
        total = sum(raw_edges.values())
        local_edges = Counter()
        for vertex_b, cnt in raw_edges.items():
            prob = cnt / total
            local_edges[vertex_b] = prob
        edges[vertex_a] = local_edges
    return Edges(edges), valid_vertices
Example #13
def extract_qk_unit(info_path, pred_path, config_path) -> Iterable[QKUnit]:
    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions = join_prediction_with_info(pred_path, info)
    grouped: Dict[str, List[Dict]] = group_by(predictions,
                                              lambda x: x['query'].query_id)
    config = json.load(open(config_path, "r"))
    score_cut = config['score_cut']
    top_k = config['top_k']

    def is_good(entry):
        return get_regression_score(entry) > score_cut

    select_rate_list = []
    qk_units = []
    for qid, entries in grouped.items():
        any_entry = entries[0]
        query = any_entry['query']
        good_entries = lfilter(is_good, entries)
        good_entries.sort(key=get_regression_score, reverse=True)
        selected_entries = good_entries[:top_k]
        if not selected_entries:
            continue
        kd_list = lmap(lambda x: x['kdp'], selected_entries)
        qk_units.append((query, kd_list))

        select_rate = len(selected_entries) / len(entries)
        select_rate_list.append(select_rate)

    print("{} of {} qk units selected".format(len(qk_units), len(grouped)))
    print("average select rate", average(select_rate_list))
    return qk_units
Example #14
def get_feature_binary_model(claim_id,
                             perspective_id,
                             claim_text,
                             perspective_text,
                             ci: DynRankedListInterface,
                             is_mention_fn: Callable[[Counter[str], str, str], bool],
                             ) -> Tuple[Counter, int]:

    def is_mention(doc: Counter) -> bool:
        return is_mention_fn(doc, claim_text, perspective_text)

    print(claim_id, perspective_id)
    ranked_docs: List[SimpleRankedListEntry] = ci.query(claim_id, perspective_id, claim_text, perspective_text)
    ranked_docs = ranked_docs[:100]
    print("{} docs in ranked list".format(len(ranked_docs)))

    doc_id_list: List[str] = lmap(get_doc_id, ranked_docs)

    tf_d = load_multiple(CluewebDocTF, doc_id_list, True)
    not_found = []
    for idx, doc_id in enumerate(doc_id_list):
        if doc_id not in tf_d:
            not_found.append(idx)

    ranked_docs_tf = tf_d.values()
    mentioned_docs: List[Counter] = lfilter(is_mention, ranked_docs_tf)
    print("Found doc", len(tf_d), "mentioned doc", len(mentioned_docs))

    docs_rel_freq: List[Counter] = lmap(div_by_doc_len, mentioned_docs)
    num_doc: int = len(docs_rel_freq)
    p_w_m: Counter = average_tf_over_docs(docs_rel_freq, num_doc)

    return p_w_m, num_doc
Example #15
def extract_predictions(score_d, split):
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(
        split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))

    def make_decisions(e: Tuple[int, List[Dict]]):
        cid, p_list = e
        decisions = []
        for p in p_list:
            pid = int(p['pid'])
            query_id = CPIDPair((cid, pid))

            if query_id in score_d:
                score = score_d[query_id]
            else:
                score = 0

            binary = 1 if score > 0.5 else 0
            decisions.append((cid, pid, binary))

        return cid, decisions

    predictions = lmap(make_decisions, candidates)
    return predictions
Example #16
def stats():
    entries = list(read())
    print("Total items", len(entries))

    unique = set()
    for e in entries:
        unique.add((e.doc_id, e.part_idx))

    print("Unique passages", len(unique))

    avg_value_list = lmap(lambda x: x.avg_value, entries)
    predicted_score_list = lmap(lambda x: x.predicted_score, entries)

    good_doc = set()
    for e in entries:
        if e.predicted_score > 0.9:
            good_doc.add(e.doc_id)

    r = get_correlation(avg_value_list, predicted_score_list)
    print(r)

    over_09 = lfilter(lambda x: x.predicted_score > 0.9, entries)
    under_01 = lfilter(lambda x: x.predicted_score < 0.1, entries)
    doc_over_09 = lfilter(lambda x: x.doc_id in good_doc, entries)
    doc_over_09_under_01 = lfilter(lambda x: x.doc_id in good_doc, under_01)

    def is_good(x):
        return x.avg_value > 0.01

    def is_bad(x):
        return x.avg_value < -0.01

    for criteria in [is_good, is_bad]:
        good_global = lfilter(criteria, entries)
        good_over_09 = lfilter(criteria, over_09)
        good_under_01 = lfilter(criteria, under_01)
        good_doc_over_09 = lfilter(criteria, doc_over_09)
        good_doc_over_09_under_01 = lfilter(criteria, doc_over_09_under_01)

        job = criteria.__name__
        print("global {} rate".format(job),
              get_rate_str(len(good_global), len(entries)))
        print("over 09 {} rate".format(job),
              get_rate_str(len(good_over_09), len(over_09)))
        print("under 01 {} rate".format(job),
              get_rate_str(len(good_under_01), len(under_01)))
        print("doc over 09 {} rate".format(job),
              get_rate_str(len(good_doc_over_09), len(doc_over_09)))
        print(
            "doc over 09 under 01 {} rate".format(job),
            get_rate_str(len(good_doc_over_09_under_01),
                         len(doc_over_09_under_01)))
Example #17
def paragraph_scorer(idf_fn: Callable[[str], float], q_terms: Set[str],
                     paragraph: List[str]) -> float:
    paragraph_terms = set(paragraph)
    mentioned_terms = lfilter(lambda x: x in paragraph_terms, q_terms)
    mentioned_terms = re_tokenize(mentioned_terms)

    score = sum(lmap(idf_fn, mentioned_terms))
    return score
Example #18
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

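        # passage score: mean per-token log-odds (claim LM vs. background LM); stopwords contribute zero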
        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)
Example #19
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[PairedInstance]:
        cid = claim['cId']
        perspective_clusters: List[List[int]] = self.gold[cid]

        passages = self.cid_to_passages[cid]
        gold_candidate_texts: List[str] = flatten_map(perspective_getter,
                                                      perspective_clusters)

        good_passages: List[List[str]] = left(
            lfilter(score_over_zero, passages))
        not_good_passages: List[List[str]] = left(
            lfilter_not(score_over_zero, passages))

        # print("good/not_good passages : {}/{}".format(len(good_passages), len(not_good_passages)))

        # make good vs not_good pairs
        # about 100 items
        pair_list_g_ng: List[Tuple[
            List[str], List[str]]] = generate_pairwise_combinations(
                not_good_passages, good_passages, True)
        # make not_good vs random pairs
        # about 100 items
        pair_list_ng_rand: List[Tuple[List[str], List[str]]] = list([
            (inst, self.random_sample(cid)) for inst in not_good_passages
        ])

        # generate (candidate_texts) X (two pair_list), limiting the maximum to 5 * len(two pair_list) = 1000
        max_insts = 100 * 2 * 5

        def infinite_passage_iterator():
            while True:
                for pair in pair_list_g_ng:
                    strict_good = 1
                    strict_bad = 0
                    yield pair, strict_good, strict_bad
                for pair in pair_list_ng_rand:
                    strict_good = 0
                    strict_bad = 1
                    yield pair, strict_good, strict_bad

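        # cycle through both pair lists so every candidate text draws n_passage_per_inst pairs, keeping the total near max_insts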
        itr = infinite_passage_iterator()
        all_passage_pair_len = len(pair_list_g_ng) + len(pair_list_ng_rand)
        n_passage_per_inst = int(max_insts / len(gold_candidate_texts)) + 1
        n_passage_per_inst = min(all_passage_pair_len, n_passage_per_inst)

        all_insts = []
        for candidate in gold_candidate_texts:
            for _ in range(n_passage_per_inst):
                passage_pair, strict_good, strict_bad = itr.__next__()
                passage_good, passage_worse = passage_pair
                insts = PairedInstance(passage_good, passage_worse, candidate,
                                       strict_good, strict_bad)
                all_insts.append(insts)
        return all_insts
Example #20
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Example #21
    def filter_map(qk_unit: QKUnit):
        query, kdp_list = qk_unit
        good_doc_list = good_doc_list_d[query.query_id]

        def is_good(kdp):
            return kdp.doc_id in good_doc_list

        new_kdp_list = lfilter(is_good, kdp_list)
        print("{} -> {}".format(len(kdp_list), len(new_kdp_list)))
        if not new_kdp_list:
            stat_count["no kdp"] += 1
        return query, new_kdp_list
Example #22
def get_ap_list_from_score_d(score_d, split):
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(
        split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))
    predictions = predict_from_dict(score_d, sub_candidates, 50)
    cids = left(predictions)
    ap_list = get_average_precision_list(predictions, False)
    return ap_list, cids
Example #23
def eval_map(split, score_d: Dict[CPIDPair, float], debug=False):
    # load pre-computed perspectives
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(
        split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))
    print(left(sub_candidates))
    predictions = predict_from_dict(score_d, sub_candidates, 50)
    return evaluate_map(predictions, debug)
Example #24
    def work(self, job_id):
        features: List[ParagraphFeature] = pickle.load(
            open(os.path.join(self.input_dir, str(job_id)), "rb"))

        def include(f: ParagraphFeature) -> bool:
            return f.datapoint.id in self.dp_id_set

        features: List[ParagraphFeature] = lfilter(include, features)

        if features:
            self.write(features, job_id)
        else:
            print("No features")
Example #25
def filter_with_ranked_list(
    qk_units: List[QKUnit],
    ranked_list_d: Dict[str, List[TrecRankedListEntry]],
    threshold,
    top_k,
) -> List[QKUnit]:

    out_qk_units = []
    for q, k_list in qk_units:
        try:
            cur_ranked_list = ranked_list_d[q.query_id]
            entries: Dict[str, TrecRankedListEntry] = {
                e.doc_id: e
                for e in cur_ranked_list
            }
            n_k_list = len(k_list)

            not_found_set = set()

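            # look up each candidate's retrieval score; entries missing from the ranked list sink to the bottom (-1e10)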
            def get_score(k: KDP):
                key = k.to_str()
                if key in entries:
                    s: TrecRankedListEntry = entries[key]
                    return s.score
                else:
                    not_found_set.add(key)
                    return -1e10

            k_list.sort(key=get_score, reverse=True)

            def higher(k: KDP) -> bool:
                return get_score(k) >= threshold

            if threshold is not None:
                k_list = lfilter(higher, k_list)

            if top_k is None or top_k == -1:
                pass
            else:
                k_list = k_list[:top_k]
            out_qk_units.append((q, k_list))
            if not_found_set:
                print("For query {}, {} of {} do not have score".format(
                    q.query_id, len(not_found_set), n_k_list))
        except KeyError as e:
            print(e, "KeyError", q.query_id)

    print(lmap(len, right(out_qk_units)))
    return out_qk_units
Example #26
        def paragraph_scorer(paragraph: Paragraph) -> ScoreParagraph:
            paragraph_terms = set(paragraph.tokens)
            mentioned_terms = lfilter(lambda x: x in paragraph_terms, cp_tokens)
            mentioned_terms = re_tokenize(mentioned_terms)

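            # smoothed IDF; clue12_13_df is assumed to map terms to document frequencies and cdf to the collection size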
            def idf(term: str):
                if term not in clue12_13_df:
                    if term in string.printable:
                        return 0
                    not_found_set.add(term)

                return math.log((cdf+0.5)/(clue12_13_df[term]+0.5))

            score = sum(lmap(idf, mentioned_terms))
            max_score = sum(lmap(idf, cp_tokens))
            return ScoreParagraph(paragraph=paragraph, score=score)
Example #27
def featurize_fn(voca, voca2idx, datapoint):
    rm_list, label = datapoint
    nonzero = lfilter(lambda x: x > 0, right(rm_list))
    if nonzero:
        nonzero_min = min(nonzero)
    else:
        nonzero_min = 0

    terms = left(rm_list)
    term_ids = lmap(lambda x: voca2idx[x], terms)
    scores = list([s if s > 0 else 0.2 * nonzero_min for s in right(rm_list)])

    v = np.zeros([len(voca)])
    for idx, score in zip(term_ids, scores):
        v[idx] = score
    return v, label
Example #28
    def idf_scorer(doc: Counter, claim_text: str, perspective_text: str) -> bool:
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)
        mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
        mentioned_terms = re_tokenize(mentioned_terms)

        def idf(term: str):
            if term not in clue12_13_df:
                if term in string.printable:
                    return 0
                not_found_set.add(term)

            return math.log((cdf+0.5)/(clue12_13_df[term]+0.5))

        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        return score > max_score * 0.8
Example #29
def collect_good_passages(data_id_to_info: Dict[str, Dict],
                          passage_score_path: FilePath,
                          config: Dict
                          ) -> List[Tuple[str, List[QKOutEntry]]]:
    global recover_subtokens
    recover_subtokens = get_recover_subtokens()
    score_cut = config['score_cut']
    top_k = config['top_k']
    score_type = config['score_type']
    fetch_field_list = ["logits", "input_ids", "data_id"]
    data: List[Dict] = join_prediction_with_info(passage_score_path,
                                                 data_id_to_info,
                                                 fetch_field_list
                                                 )
    qk_out_entries: List[QKOutEntry] = lmap(QKOutEntry.from_dict, data)

    grouped: Dict[str, List[QKOutEntry]] = group_by(qk_out_entries, lambda x: x.query.query_id)

    def get_score_from_logit_local(logits) -> float:
        return get_score_from_logit(score_type, logits)

    def get_score(entry: QKOutEntry):
        return get_score_from_logit_local(entry.logits)

    def is_good(qk_out_entry: QKOutEntry):
        score = get_score_from_logit_local(qk_out_entry.logits)
        return score >= score_cut

    output = []
    num_passges = []
    for cid, passages in grouped.items():
        good_passages = lfilter(is_good, passages)
        good_passages.sort(key=get_score, reverse=True)
        num_passges.append(len(good_passages))
        if good_passages:
            output.append((cid, good_passages[:top_k]))
        else:
            scores = lmap(get_score, passages)
            scores.sort(reverse=True)

    print(num_passges)
    print("{} of {} query has passages".format(len(output), len(grouped)))
    return output
Example #30
def main():
    qk_list = load_from_pickle("pc_evidence_qk")
    split = "train"
    split = "dev"
    tprint("Building query lms")
    query_lms = get_query_lms(split)
    split_query_ids = list(query_lms.keys())

    def is_split(qk: QKUnit):
        q, k = qk
        if q.query_id in split_query_ids:
            return True
        else:
            return False

    qk_for_split = lfilter(is_split, qk_list)
    tprint("start filtering")
    filtered_qk = filter_qk(qk_for_split, query_lms)

    save_to_pickle(filtered_qk, "pc_evi_filtered_qk_{}".format(split))