Example #1
def main():
    judgment_path = sys.argv[1]
    metric = sys.argv[2]
    ranked_list_path1 = sys.argv[3]
    ranked_list_path2 = sys.argv[4]
    # print
    qrels = load_qrels_flat(judgment_path)

    ranked_list_1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(ranked_list_path1)
    ranked_list_2: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(ranked_list_path2)

    metric_fn = get_metric_fn(metric)

    score_d1 = get_score_per_query(qrels, metric_fn, ranked_list_1)
    score_d2 = get_score_per_query(qrels, metric_fn, ranked_list_2)

    pairs = []
    for key in score_d1:
        try:
            e = (score_d1[key], score_d2[key])
            pairs.append(e)
        except KeyError as e:
            pass

    if len(pairs) < len(score_d1) or len(pairs) < len(score_d2):
        print("{} matched from {} and {} scores".format(len(pairs), len(score_d1), len(score_d2)))

    l1, l2 = zip(*pairs)
    d, p_value = stats.ttest_rel(l1, l2)
    print("baseline:", average(l1))
    print("treatment:", average(l2))
    print(d, p_value)
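Every example in this listing calls an `average` helper defined elsewhere in the clover3/Chair project; its implementation is not reproduced here. A minimal sketch, assuming it is simply the arithmetic mean of a non-empty sequence:

def average(values):
    # Arithmetic mean; assumes `values` is a non-empty sequence of numbers.
    values = list(values)
    return sum(values) / len(values)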
Example #2
def per_doc_score():
    filename = "tlm_view.pickle"
    html_writer = HtmlVisualizer("per_doc_score.html", dark_mode=False)

    data = EstimatorPredictionViewerGosford(filename)
    amp = 20
    small_threshold = 40
    for inst_i, entry in enumerate(data):
        if inst_i > 1000:
            break
        scores = entry.get_vector("priority_score")

        tokens = entry.get_mask_resolved_input_mask_with_input()
        cells = data.cells_from_tokens(tokens)
        if len(cells) < small_threshold:
            continue
        avg_score = average(scores)
        if -0.11 > avg_score > -0.30:  # skip examples whose average score falls in (-0.30, -0.11)
            continue
        print(average(scores))
        html_writer.write_headline(avg_score)
        row = []
        for idx, cell in enumerate(cells):
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush the last partial row of cells
            html_writer.write_table([row])
Example #3
def avg_fn(l):
    r = average(l)
    cnt = 0
    for t in l:
        if abs(t - r) > 0.5:
            cnt += 1
    print(l)
    return r
Example #4
def main():
    score_path1 = sys.argv[1]
    score_path2 = sys.argv[2]
    # print
    l1 = get_score_per_query(score_path1)
    l2 = get_score_per_query(score_path2)

    assert len(l1) == len(l2)

    d, p_value = stats.ttest_rel(l1, l2)
    print("baseline:", average(l1))
    print("treatment:", average(l2))
    print(d, p_value)
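A hypothetical quick check of the paired t-test call used in this example and in Example #1; scipy.stats.ttest_rel returns the t statistic and the two-sided p-value (the score lists below are made up):

from scipy import stats

d, p_value = stats.ttest_rel([0.31, 0.42, 0.55], [0.30, 0.40, 0.50])
print(d, p_value)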
Example #5
    def valid_fn():
        loss_list = []
        acc_list = []
        for batch in dev_batches:
            loss_val, acc, g_step_val = sess.run(
                [task.loss, task.acc, global_step],
                feed_dict=batch2feed_dict(batch))
            loss_list.append(loss_val)
            acc_list.append(acc)
        log.info("Step dev step={0} loss={1:.04f} acc={2:.03f}".format(
            g_step_val, average(loss_list), average(acc_list)))

        return average(acc_list)
Example #6
def sanity_check():
    dvp: List[DocValueParts2] = load()
    candidate_d_raw: List[Tuple[int, List[int]]] = get_eval_candidate_as_pids(
        "train")
    candidate_d = {str(k): lmap(str, v) for k, v in candidate_d_raw}

    # Group by query id
    dvp_qid_grouped: Dict[str, List[DocValueParts2]] = group_by(dvp, get_qid)

    ap_baseline = []
    ap_new_score = []
    for qid, entries in dvp_qid_grouped.items():
        ranked_list_new = []
        ranked_list_baseline = []

        candidate_id_grouped = group_by(entries, get_candidate)
        for candidate_id, entries2 in candidate_id_grouped.items():
            is_initial_candidate = candidate_id in candidate_d[qid]
            gold = entries2[0].label
            skip = gold and not is_initial_candidate

            def get_new_score(dvp: DocValueParts2):
                return dvp.score

            def get_baseline_score(dvp: DocValueParts2):
                return dvp.init_score

            if skip:
                continue

            new_score = top_k_avg(lmap(get_new_score, entries2))
            baseline_score = average(lmap(get_baseline_score, entries2))
            ranked_list_new.append((candidate_id, new_score, gold))
            ranked_list_baseline.append((candidate_id, baseline_score, gold))

        def get_ap(ranked_list):
            ranked_list.sort(key=lambda x: x[1], reverse=True)

            p_list = []
            p = 0
            for rank, (cid, score, gold) in enumerate(ranked_list):
                if gold:
                    p += 1
                    p_list.append(p / (rank + 1))
            return average(p_list)

        ap_baseline.append(get_ap(ranked_list_baseline))
        ap_new_score.append(get_ap(ranked_list_new))

    print("MAP baseline", average(ap_baseline))
    print("MAP new score", average(ap_new_score))
Example #7
def valid_fn_factory(sess, dev_batches, loss_tensor, acc_tensor,
                     global_step_tensor, batch2feed_dict):
    loss_list = []
    acc_list = []
    for batch in dev_batches:
        loss_val, acc, g_step_val = sess.run(
            [loss_tensor, acc_tensor, global_step_tensor],
            feed_dict=batch2feed_dict(batch))
        loss_list.append(loss_val)
        acc_list.append(acc)
    tf_logging.info("Step dev step={0} loss={1:.04f} acc={2:.03f}".format(
        g_step_val, average(loss_list), average(acc_list)))

    return average(acc_list)
Example #8
File: eval.py  Project: clover3/Chair
def get_precision_recall(
        input_entries: List[Tuple[QCKQuery, List[QCKCandidate]]]) -> Dict:
    gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()

    all_scores = []
    for query, ranked_list in input_entries:
        e_id_list = lmap(QCKCandidate.get_id, ranked_list)
        gold_id = gold_dict[query.query_id]

        tp = 0
        for e_id in e_id_list:
            if e_id in gold_id:
                tp += 1

        precision = tp / len(e_id_list) if len(e_id_list) else 1
        recall = tp / len(gold_id) if len(gold_id) else 1
        # Per-query F1 is not needed here; the overall F1 is computed below from the averaged precision/recall.
        per_score = {
            'precision': precision,
            'recall': recall,
        }
        all_scores.append(per_score)

    average_scores = {}
    for metric in ['precision', 'recall']:
        average_scores[metric] = average([e[metric] for e in all_scores])

    average_scores['f1'] = get_f1(average_scores['precision'],
                                  average_scores['recall'])
    return average_scores
Example #9
def main():
    info_d = {}
    for job_id in range(5):
        p = os.path.join(cpath.data_path, "tlm", "pred",
                         "info_d_{}.pickle".format(job_id))
        d = pickle.load(open(p, "rb"))
        info_d.update(d)

    p = os.path.join(cpath.data_path, "tlm", "pred", "tlm1.pickle")
    pred = pickle.load(open(p, "rb"))

    p_l = list([list() for i in range(5)])

    tf_id_set = set()

    for e in pred:
        tf_id = info_d[e.unique_ids]
        if tf_id not in tf_id_set:
            tf_id_set.add(tf_id)
            loss = e.losses
            print(tf_id, e.unique_ids, loss)
            j = e.unique_ids % 10
            p_l[j].append(loss)

    for i in range(5):
        print("Type : {} : {}".format(i, average(p_l[i])))
Example #10
def debug_failture(predictions):
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set,
                                prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)

        if not any(correctness_list):  # all wrong
            continue

        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            correct = pid in gold_pids_set
            if correct:
                correct_str = "Y"
            else:
                correct_str = "N"

            score = prediction['score']
            print(correct_str, score, score.name,
                  prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)

    map = average(ap_list)
    return {'map': map}
Example #11
def train_test_repeat(load_id, exp_name, n_repeat):
    hp = hyperparams.HPBert()
    e_config = ExperimentConfig()
    e_config.name = "RTE_{}".format("A")
    e_config.num_epoch = 10
    e_config.save_interval = 30 * 60  # 30 minutes
    e_config.load_names = ['bert']
    vocab_filename = "bert_voca.txt"
    data_loader = rte.DataLoader(hp.seq_max, vocab_filename, True)

    print(load_id)
    scores = []
    for i in range(n_repeat):
        e = Experiment(hp)
        print(exp_name)
        e_config.name = "rte_{}".format(exp_name)
        save_path = e.train_rte(e_config, data_loader, load_id)
        acc = e.eval_rte(e_config, data_loader, save_path)
        scores.append(acc)
    print(exp_name)
    for e in scores:
        print(e, end="\t")
    print()
    r = average(scores)
    print("Avg\n{0:.03f}".format(r))
    return r
Example #12
def get_ap(predicted_perspectives, gold_pids, debug):
    ## In this metric, it is possible to get precision > 1, as some clusters share the same perspective
    # if debug:
    #     print(gold_pids)
    # for cluster in gold_pids:
    #     print("-")
    #     for pid in cluster:
    #         print(pid, perspective_getter(pid))
    def is_correct(pid):
        for cluster in gold_pids:
            if pid in cluster:
                return True
        return False

    tp = 0
    precision_list = []
    for idx, prediction in enumerate(predicted_perspectives):
        pid = prediction['pid']
        if is_correct(pid):
            tp += 1
            n_pred = idx + 1
            prec = tp / n_pred
            precision_list.append(prec)
            correct_str = "Y"
        else:
            correct_str = "N"

        if debug:
            print(correct_str, prediction['score'], prediction['rationale'],
                  pid, prediction['perspective_text'])
    assert tp == len(precision_list)
    ap = average(precision_list) if tp > 0 else 1
    return ap
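As a quick illustration of this AP computation, a hypothetical toy call (the pids, scores, and texts are invented for this sketch):

# Two gold clusters; three ranked predictions with hits at ranks 1 and 3.
toy_gold = [[10, 11], [20]]
toy_predictions = [
    {'pid': 10, 'score': 0.9, 'rationale': '-', 'perspective_text': 'p10'},
    {'pid': 99, 'score': 0.5, 'rationale': '-', 'perspective_text': 'p99'},
    {'pid': 20, 'score': 0.3, 'rationale': '-', 'perspective_text': 'p20'},
]
# Precision at the correct ranks is 1/1 and 2/3, so AP = (1 + 2/3) / 2 ≈ 0.83.
print(get_ap(toy_predictions, toy_gold, debug=False))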
Example #13
def main():
    judgment_path = sys.argv[1]
    ranked_list_path = sys.argv[2]
    metric = sys.argv[3]

    qrels = load_qrels_flat_per_query(judgment_path)
    ranked_list: Dict[str,
                      List[TrecRankedListEntry]] = load_ranked_list_grouped(
                          ranked_list_path)

    metric_fn = get_metric_fn(metric)

    score_per_query_list = []
    not_found = 0
    for query_id in ranked_list:
        q_ranked_list = ranked_list[query_id]

        try:
            gold_list = qrels[query_id]
            true_gold = list(
                [doc_id for doc_id, score in gold_list if score > 0])
            score_per_query = metric_fn(q_ranked_list, true_gold)
            score_per_query_list.append(score_per_query)
        except KeyError as e:
            not_found += 1

    if not_found:
        print("{} of {} queires not found".format(not_found, len(ranked_list)))

    score = average(score_per_query_list)
    print("{}\t{}".format(metric, score))
Example #14
def extract_qk_unit(info_path, pred_path, config_path) -> Iterable[QKUnit]:
    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions = join_prediction_with_info(pred_path, info)
    grouped: Dict[str, List[Dict]] = group_by(predictions,
                                              lambda x: x['query'].query_id)
    config = json.load(open(config_path, "r"))
    score_cut = config['score_cut']
    top_k = config['top_k']

    def is_good(entry):
        return get_regression_score(entry) > score_cut

    select_rate_list = []
    qk_units = []
    for qid, entries in grouped.items():
        any_entry = entries[0]
        query = any_entry['query']
        good_entries = lfilter(is_good, entries)
        good_entries.sort(key=get_regression_score, reverse=True)
        selected_entries = good_entries[:top_k]
        if not selected_entries:
            continue
        kd_list = lmap(lambda x: x['kdp'], selected_entries)
        qk_units.append((query, kd_list))

        select_rate = len(selected_entries) / len(entries)
        select_rate_list.append(select_rate)

    print("{} of {} qk units selected".format(len(qk_units), len(grouped)))
    print("average select rate", average(select_rate_list))
    return qk_units
Example #15
def show(n):
    topic = "abortion"
    count = load_n_gram_from_pickle(topic, n)
    clueweb_tf, clueweb_df = load_subword_term_stat()
    clueweb_idf = df_to_idf(clueweb_df)
    c_tf, nc_tf = load_from_pickle("abortion_clm")


    avg_idf = average(list(clueweb_idf.values()))

    def get_idf(t):
        if t in clueweb_idf:
            return clueweb_idf[t]
        else:
            return avg_idf


    l = list(count.items())
    skip_count = 0
    l.sort(key=lambda x:x[1], reverse=True)
    for n_gram, cnt in l[:1000]:
        if is_single_char_n_gram(n_gram):
            skip_count += 1
        else:
            idf_sum = sum([get_idf(t) for t in n_gram])
            print("{} {}".format(n_gram, cnt) + " {0:.2f} {1:.2f} ".format(idf_sum, cnt * idf_sum))

    print("Skip", skip_count)
Example #16
def statistics_tlm():
    filename = "blc_cold_scores.pickle"
    data = EstimatorPredictionViewerGosford(filename)

    bins = {}
    bin_fn = get_bin_fn_from_interval(0, 1.05, 0.05)
    for inst_i, entry in enumerate(data):
        loss1 = entry.get_vector("lm_loss1")
        loss2 = entry.get_vector("lm_loss2")

        prob1 = loss_to_prob(loss1)
        prob2 = loss_to_prob(loss2)
        tokens = entry.get_mask_resolved_input_mask_with_input()

        for i, _ in enumerate(tokens):
            key = bin_fn(prob1[i])
            if key not in bins:
                bins[key] = []
            bins[key].append(prob2[i])

    keys = list([k for k in bins.keys() if not k == "Unidentifed"])
    keys.sort(key=lambda x:x[0])

    mean_dict = {}
    std_dict = {}
    for key in keys:
        l = average(bins[key])
        std = np.std(bins[key])
        mean_dict[key] = l
        std_dict[key] = std
        st, ed = key
        #print("{0:.2f} {1:.2f}".format(st, ed), l)
    return bin_fn, mean_dict, std_dict
Example #17
def summarize_score(info: Dict, prediction_file_path: str,
                    f_handler: FormatHandler, combine_score: Callable,
                    score_type) -> Dict[Tuple[str, str], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry['logits'])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str],
                  List[Dict]] = group_by(data, f_handler.get_pair_id)
    tprint("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        final_score = combine_score(scores)
        out_d[pair_id] = final_score

    num_items_per_group = average(lmap(len, grouped.values()))
    tprint("Num items per group : ", num_items_per_group)
    return out_d
Example #18
def view_grad_overlap():
    filename = "gradient_overlap_4K.pickle"

    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)

    data = EstimatorPredictionViewerGosford(filename)
    iba = IntBinAverage()
    scores = []
    for inst_i, entry in enumerate(data):
        masked_lm_example_loss = entry.get_vector("masked_lm_example_loss")
        score = entry.get_vector("overlap_score")

        if masked_lm_example_loss > 1:
            norm_score = score / masked_lm_example_loss
            iba.add(masked_lm_example_loss, norm_score)
        scores.append(score)

    score_avg = average(scores)
    score_std = np.std(scores)

    avg = iba.all_average()
    std_dict = {}
    for key, values in iba.list_dict.items():
        std_dict[key] = np.std(values)
        if len(values) == 1:
            std_dict[key] = 999

    def unlikeliness(value, mean, std):
        return abs(value - mean) / std

    data = EstimatorPredictionViewerGosford(filename)
    print("num record : ", data.data_len)
    cnt = 0
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        masked_lm_example_loss = entry.get_vector("masked_lm_example_loss")
        highlight = lmap(is_mask, tokens)
        score = entry.get_vector("overlap_score")
        print(score)
        cells = data.cells_from_tokens(tokens, highlight)
        if masked_lm_example_loss > 1:
            bin_key = int(masked_lm_example_loss)
            norm_score = score / masked_lm_example_loss
            if norm_score > 5000:
                cnt += 1
            expectation = avg[bin_key]
            if unlikeliness(score, score_avg, score_std) > 2 or True:  # 'or True' makes this condition always pass, so every record is printed
                html_writer.multirow_print(cells, 20)
                if norm_score > expectation:
                    html_writer.write_paragraph("High")
                else:
                    html_writer.write_paragraph("Low")
                html_writer.write_paragraph("Norm score: " + str(norm_score))
                html_writer.write_paragraph("score: " + str(score))
                html_writer.write_paragraph("masked_lm_example_loss: " +
                                            str(masked_lm_example_loss))
                html_writer.write_paragraph("expectation: " + str(expectation))
    print("number over 5000: ", cnt)
Example #19
def main():
    relevance_scores: Dict[CPIDPair, List[Tuple[Logits, Logits]]] = load_from_pickle("pc_relevance_score")
    gold = get_claim_perspective_id_dict()

    true_feature = []
    false_feature = []

    ticker = TimeEstimator(len(relevance_scores))
    for key in relevance_scores:
        ticker.tick()
        cid, pid = key

        gold_pids = flatten(gold[cid])
        gold_pids = list([int(pid) for pid in gold_pids])
        correct = pid in gold_pids
        scores: List[Tuple[List[float], List[float]]] = relevance_scores[key]

        c_count = 0
        p_count = 0
        pc_count = 0
        for c_logits, p_logits in scores:
            c_rel = softmax(c_logits)[1] > 0.5
            p_rel = softmax(p_logits)[1] > 0.5

            c_count += int(c_rel)
            p_count += int(p_rel)
            pc_count += int(c_rel and p_rel)

        if correct:
            true_feature.append(pc_count)
        else:
            false_feature.append(pc_count)

    all_feature = true_feature + false_feature
    all_feature.sort()
    mid = int(len(all_feature)/2)
    cut_off = all_feature[mid]

    tp = sum([int(t > cut_off) for t in true_feature])
    fp = sum([int(t > cut_off) for t in false_feature])
    tn = sum([int(t <= cut_off) for t in false_feature])
    fn = sum([int(t <= cut_off) for t in true_feature])

    print(tp, fp, tn, fn)
    print("true feature", average(true_feature))
    print("false feature", average(false_feature))
Example #20
def text_len():
    d = get_tokens()
    data = list([list() for _ in range(5)])
    for key, tokens in d:
        data[key].append(len(pretty_tokens(tokens)))

    for i in range(5):
        print(i, average(data[i]))
Example #21
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)
                pass

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
Example #22
def main():
    claim_text_d: Dict[int, str] = get_all_claim_d()
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()

        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        num_rel = len(pos_entries)

        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))

        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
Example #23
File: eval.py  Project: clover3/Chair
def run_eval_threaded(split, predictor_getter):
    print("Loading data..")
    problems: List[ArguDataPoint] = list(load_labeled_data(split))
    payload: List[Passage] = get_eval_payload_from_dp(problems)
    print("starting predictions")
    predictions = parallel_run(payload, (split, predictor_getter), eval_thread,
                               5)
    correctness = eval_correctness(predictions, problems)
    avg_p_at_1 = average(correctness)
    print(avg_p_at_1)
Example #24
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Example #25
def view():
    filename = os.path.join(output_path, "nli_dev_loss.pickle")
    data = EstimatorPredictionViewerGosford(filename)

    loss_arr = []
    for inst_i, entry in enumerate(data):
        t = entry.get_vector("loss")
        loss_arr.append(float(t))

    print(len(loss_arr))
    print("avg:", average(loss_arr))
Example #26
def perspective_lm_correlation():
    d_ids = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    gold = get_claim_perspective_id_dict()
    predictions = predict_with_lm(claims, top_k)

    avg_pos_list = []
    avg_neg_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']

        pos_list = []
        neg_list = []
        print("Claim {}: ".format(c_Id), claim_text)
        for prediction in prediction_list:
            pid = prediction['pid']
            valid = False
            for cluster in gold_pids:
                if pid in cluster:
                    valid = True
                    break
            print("{0} {1:.2f} {2}".format(valid, prediction['lm_score'],
                                           prediction['perspective_text']))
            if not valid:
                neg_list.append(prediction['lm_score'])
            else:
                pos_list.append(prediction['lm_score'])

        if pos_list and neg_list:
            pos_score = average(pos_list)
            neg_score = average(neg_list)
            avg_pos_list.append(pos_score)
            avg_neg_list.append(neg_score)

    diff, p = ttest_ind(avg_pos_list, avg_neg_list)
    print("pos", average(avg_pos_list), "neg", average(avg_neg_list))
    print("pos", avg_pos_list)
    print("neg", avg_neg_list)
    print(diff, p)
Example #27
    def generate(
            self, query_list, data_id_manager: DataIDManager
    ) -> List[ClassificationInstanceWDataID]:
        neg_k = 1000
        all_insts = []
        pos_n_segment = []
        neg_n_segment = []
        for query_id in query_list:
            if query_id not in self.judgement:
                continue

            judgement = self.judgement[query_id]
            query = self.queries[query_id]
            query_tokens = self.tokenizer.tokenize(query)

            ranked_list = self.galago_rank[query_id]
            ranked_list = ranked_list[:neg_k]

            target_docs = set(judgement.keys())
            target_docs.update([e.doc_id for e in ranked_list])
            print("Total of {} docs".format(len(target_docs)))

            for doc_id in target_docs:
                tokens = self.data[doc_id]
                insts: List[Tuple[List, List]] = self.encoder.encode(
                    query_tokens, tokens)
                label = 1 if doc_id in judgement and judgement[
                    doc_id] > 0 else 0
                target_indices = self.target_selection_fn(
                    query_id, doc_id, insts)
                n_segment = len(target_indices)
                if label:
                    pos_n_segment.append(n_segment)
                else:
                    neg_n_segment.append(n_segment)

        print("num pos docs: ", len(pos_n_segment))
        print("num neg docs: ", len(neg_n_segment))
        print("avg n_seg per doc [pos]", average(pos_n_segment))
        print("avg n_seg per doc [neg]", average(neg_n_segment))
        return all_insts
Example #28
def average_likelihood(sent, fn_get_prob):
    log_p = 0
    for i, w in enumerate(sent):
        p_list = []
        for j, w2 in enumerate(sent):
            if i != j:
                p = fn_get_prob(w, w2)
                p_list.append(p)

        avg_p = average(p_list)
        log_p += math.log(avg_p)
    return math.exp(log_p)
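A hypothetical sanity check for average_likelihood; with a constant pairwise probability the expected value can be verified by hand:

# With fn_get_prob fixed at 0.5 and a 3-word sentence, each avg_p is 0.5,
# so the result is 0.5 ** 3 = 0.125.
print(average_likelihood(["a", "b", "c"], lambda w, w2: 0.5))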
Example #29
def evaluate(predictions, debug=True):
    gold = get_claim_perspective_id_dict()
    prec_list = []
    recall_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']
        if debug:
            print("Claim {}: ".format(c_Id), claim_text)
        prec, recall = get_prec_recll(prediction_list, gold_pids, debug)
        prec_list.append(prec)
        recall_list.append(recall)

    avg_prec = average(prec_list)
    avg_recall = average(recall_list)

    return {
        'precision': avg_prec,
        'recall': avg_recall,
        'f1': get_f1(avg_prec, avg_recall)
    }