Пример #1
0
def load_data(data_name):
    """Load the prediction data registered under ``data_name``.

    Returns a ``(data, pickle_path)`` pair. For "run_clueweb",
    "hydroponics" and "weather", ``data`` is whatever
    ``combine_prediction_list`` builds from the dataset's path list; for
    every other name it is an ``EstimatorPredictionViewer`` over a file
    under ``output_path``. An unrecognized name is used directly as the
    prediction file name.
    """
    print("reading data")

    # Datasets assembled from a list of prediction files.
    if data_name == "run_clueweb":
        data, data_len = combine_prediction_list(
            clueweb_pair_prediction_list())
        print("total of {} data".format(data_len))
        return data, get_clueweb_out_pickle_path()
    if data_name == "hydroponics":
        data, _ = combine_prediction_list(hydroponics_prediction_path_list())
        return data, os.path.join(output_path, data_name)
    if data_name == "weather":
        data, _ = combine_prediction_list(weather_prediction_path_list())
        return data, os.path.join(output_path, data_name)

    # Datasets stored as a single prediction file under output_path.
    if data_name == "first_abortion":
        prediction_name = "nli_prediction"
        pickle_name = "abortion_nli_prediction_analysis"
    elif data_name == "rerun_abortion":
        prediction_name = "abortion_contradiction"
        pickle_name = "abortion_nli_prediction_analysis2"
    else:
        prediction_name = data_name
        pickle_name = data_name + "_analysis"

    data = EstimatorPredictionViewer(os.path.join(output_path, prediction_name))
    print("total of {} data".format(data.data_len))
    return data, os.path.join(output_path, pickle_name)
Пример #2
0
def collect_data_w_cpid(prediction_file, info: Dict, logit_to_score) \
        -> List[Dict]:
    """Attach score, confidence and cpid to the info record of each entry.

    For every prediction entry, the record ``info[str(data_id)]`` is
    updated in place with the keys 'cpid', 'score' and 'confidence' and
    appended to the returned list. Entries whose record (or its 'cid' /
    'pid' key) is missing are reported on stdout and skipped.
    """
    viewer = EstimatorPredictionViewer(prediction_file)
    print("Num data ", viewer.data_len)

    collected = []
    for entry in viewer:
        score = logit_to_score(entry.get_vector("logits"))
        data_id = entry.get_vector("data_id")[0]
        confidence = get_confidence_or_rel_score(entry)
        try:
            record = info[str(data_id)]
            record['cpid'] = CPIDPair((record['cid'], record['pid']))
            record['score'] = score
            record['confidence'] = confidence
            collected.append(record)
        except KeyError:
            print("Key error")
            print("data_id", data_id)
    return collected
Пример #3
0
def read_passage_scores(prediction_file,
                        info: Dict,
                        recover_subtokens
                        ) \
        -> Dict[int, List[Dict]]:
    """Group per-passage prediction records by claim id.

    Each prediction entry is looked up in ``info`` by ``int(data_id)``; a
    dict with the keys 'cid', 'passage', 'logits' and 'data_id' is appended
    to the list stored under that entry's 'cid'. 'passage' is
    ``recover_subtokens`` applied to the entry's "input_ids" vector.

    Raises:
        Exception: once more than 100 entries have failed with a KeyError,
            since that many misses suggests ``info`` does not belong to
            this prediction file.
    """
    max_fail = 100
    data = EstimatorPredictionViewer(prediction_file)
    print("Num data ", data.data_len)
    output: Dict[int, List] = defaultdict(list)
    fail_cnt = 0
    for entry in data:
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[int(data_id)]
            cid = cur_info['cid']
            d = {
                'cid': cid,
                'passage': recover_subtokens(entry.get_vector("input_ids")),
                'logits': logits,
                'data_id': data_id,
            }
            output[cid].append(d)
        except KeyError as e:
            print("Key error")
            print("data_id", data_id)
            fail_cnt += 1
            if fail_cnt > max_fail:
                # Same exception type as before, now with a diagnostic
                # message and the triggering KeyError chained as the cause.
                raise Exception(
                    "More than {} entries of {} could not be resolved in "
                    "info (last data_id: {})".format(
                        max_fail, prediction_file, data_id)) from e
    return output
Пример #4
0
def count_and_save(j):
    """Run ``count_contradiction`` on prediction shard ``j`` and pickle it.

    Reads ``clueweb12_13B_pair/nli_prediction_{j}`` under ``output_path``
    and writes the result to ``clueweb12_13B_pair_summary_{j}`` under the
    same root.
    """
    prediction_path = os.path.join(output_path, "clueweb12_13B_pair",
                                   "nli_prediction_{}".format(j))
    summary = count_contradiction(EstimatorPredictionViewer(prediction_path))

    summary_path = os.path.join(
        output_path, "clueweb12_13B_pair_summary_{}".format(j))
    # Context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open(summary_path, "wb") as f:
        pickle.dump(summary, f)
Пример #5
0
def combine_pc_rel_with_cpid(prediction_file, info: Dict) \
        -> Dict[DataID, Tuple[CPIDPair, Logits, Logits]]:
    """Pair each 'pid' entry with the most recent preceding 'cid' entry.

    Entries are read in file order. An entry whose info record has a 'cid'
    key is remembered; a following entry whose record has a 'pid' key
    produces ``(CPIDPair((cid, pid)), cid_logits, pid_logits)``, stored
    under both that entry's data_id and the remembered 'cid' entry's
    data_id (so the latter is overwritten by each subsequent 'pid' entry).

    Entries whose data_id is missing from ``info`` are printed and skipped.

    Raises:
        AssertionError: if an info record has neither 'cid' nor 'pid'.
        TypeError: if a 'pid' entry appears before any 'cid' entry
            (unpacking the initial ``None``).
    """
    data = EstimatorPredictionViewer(prediction_file)
    print("Num data ", data.data_len)
    out_d: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = {}
    last_claim = None
    prev_data_id = None
    ticker = TimeEstimator(data.data_len)
    for entry in data:
        ticker.tick()
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[data_id]
            if 'cid' in cur_info:
                last_claim = cur_info['cid'], logits
                prev_data_id = data_id
            elif 'pid' in cur_info:
                pid = cur_info['pid']
                cid, c_logits = last_claim
                cpid = CPIDPair((cid, pid))
                out_d[data_id] = (cpid, c_logits, logits)
                out_d[prev_data_id] = (cpid, c_logits, logits)
            else:
                # Raised explicitly (instead of `assert False`) so that the
                # check is not stripped when Python runs with -O.
                raise AssertionError(
                    "info record for data_id {} has neither 'cid' nor "
                    "'pid'".format(data_id))
        except KeyError as e:
            print(e)
    return out_d
Пример #6
0
def save_ranked_list(prediction_path, meta_info, save_path):
    """Write a per-query ranked list built from prediction logits.

    ``meta_info[data_id]`` must yield ``(q_id, doc_id)``. Each document is
    scored with index 1 of ``softmax(logits)``; documents of a query are
    ordered by descending score and given 0-based ranks before being passed
    to ``write_ranked_list_from_d``.
    """

    def add_rank(
            ranked_list: List[Tuple[str,
                                    float]]) -> List[Tuple[str, int, float]]:
        # Highest score first; rank is the 0-based position in that order.
        ordered = sorted(ranked_list, key=lambda x: x[1], reverse=True)
        return [(doc_id, rank, score)
                for rank, (doc_id, score) in enumerate(ordered)]

    docs_per_query = {}
    for entry in EstimatorPredictionViewer(prediction_path):
        data_id = entry.get_vector('data_id')[0]
        scores = entry.get_vector('logits')
        q_id, doc_id = meta_info[data_id]

        doc_scores = docs_per_query.setdefault(q_id, [])
        probs = softmax(scores)
        doc_scores.append((doc_id, probs[1]))

    ranked_per_query = dict_value_map(add_rank, docs_per_query)
    write_ranked_list_from_d(ranked_per_query, save_path)
Пример #7
0
def load_prediction(pred_path) -> List[Tuple[str, List[np.ndarray]]]:
    """Return, per input key, the list of softmax outputs of its entries.

    The key of an entry is ``input_tokens_to_key`` applied to its
    'input_ids' tokens. Keys appear in the order produced by
    ``unique_from_sorted`` over the per-entry keys.
    """

    def parse_entry(entry) -> Tuple[str, np.ndarray]:
        tokens = entry.get_tokens('input_ids')
        probs = softmax(entry.get_vector("logits"))
        return input_tokens_to_key(tokens), probs

    viewer = EstimatorPredictionViewer(pred_path)
    key_prob_pairs: List[Tuple[str, np.ndarray]] = lmap(parse_entry, viewer)

    ordered_keys: List[str] = unique_from_sorted(left(key_prob_pairs))
    pairs_by_key: Dict[str, List[Tuple[str, np.ndarray]]] = group_by(
        key_prob_pairs, lambda x: x[0])

    def fetch_scores(key):
        scores = []
        for entry_key, probs in pairs_by_key[key]:
            # Sanity check that grouping kept pairs under their own key.
            assert key == entry_key
            scores.append(probs)
        return key, scores

    return lmap(fetch_scores, ordered_keys)
Пример #8
0
def collect_scores(prediction_file, info: Dict, logit_to_score) \
        -> Dict[DataID, Tuple[CPIDPair, float]]:
    """Map each data_id to its ``(CPIDPair, score)``.

    The info record ``info[str(data_id)]`` may come in three layouts:
    with a 'kdp' key or a 'query' key (parsed in place by
    ``parse_info_inner`` with the matching convert map, ids then read from
    'query' / 'candidate'), or with plain 'cid' / 'pid' keys. Entries that
    raise a KeyError are reported on stdout and skipped.
    """
    viewer = EstimatorPredictionViewer(prediction_file)
    print("Num data ", viewer.data_len)

    scores_by_id: Dict[DataID, Tuple[CPIDPair, float]] = {}
    for entry in viewer:
        score = logit_to_score(entry.get_vector("logits"))
        data_id = entry.get_vector("data_id")[0]
        try:
            record = info[str(data_id)]

            if 'kdp' in record or 'query' in record:
                # 'kdp' takes precedence over 'query' when both are present.
                convert_map = (qck_convert_map if 'kdp' in record
                               else qc_convert_map)
                parse_info_inner(record, convert_map, True)
                cid = int(record['query'].query_id)
                pid = int(record['candidate'].id)
            else:
                cid = record['cid']
                pid = record['pid']

            scores_by_id[data_id] = (CPIDPair((cid, pid)), score)
        except KeyError as e:
            print("Key error", e)
            print("data_id", data_id)
    return scores_by_id
Пример #9
0
def collect_pc_rel_score(prediction_file, info: Dict):
    """Group (cid logits, pid logits) pairs by ``(cid, pid)``.

    Entries are read in file order. An entry whose info record has a 'cid'
    key is remembered; each following entry whose record has a 'pid' key
    contributes ``(cid_logits, pid_logits)`` to the list stored under the
    key ``(cid, pid)``. The number of appended pairs is printed at the end.

    Entries whose data_id is missing from ``info`` are printed and skipped.

    Raises:
        AssertionError: if an info record has neither 'cid' nor 'pid'.
        TypeError: if a 'pid' entry appears before any 'cid' entry
            (unpacking the initial ``None``).
    """
    data = EstimatorPredictionViewer(prediction_file)

    print("Num data ", data.data_len)
    group_by_key = {}
    num_append = 0
    last_claim = None
    ticker = TimeEstimator(data.data_len)
    for entry in data:
        ticker.tick()
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[data_id]
            if 'cid' in cur_info:
                last_claim = cur_info['cid'], logits
            elif 'pid' in cur_info:
                pid = cur_info['pid']
                cid, c_logits = last_claim
                group_by_key.setdefault((cid, pid), []).append(
                    (c_logits, logits))
                num_append += 1
            else:
                # Raised explicitly (instead of `assert False`) so that the
                # check is not stripped when Python runs with -O.
                raise AssertionError(
                    "info record for data_id {} has neither 'cid' nor "
                    "'pid'".format(data_id))
        except KeyError as e:
            print(e)
    print(num_append)
    return group_by_key
Пример #10
0
def eval(file_name):
    """Print "is_test_inst" and the per-example loss label of every entry.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    reference it.
    """
    for record in EstimatorPredictionViewer(file_name):
        # The results of these two lookups are discarded, exactly as before.
        record.get_vector("masked_label_ids_label")
        record.get_vector("is_test_inst")

        is_test_inst = record.get_vector("is_test_inst")
        example_loss = record.get_vector("masked_lm_example_loss_label")
        print(is_test_inst, example_loss)
Пример #11
0
def combine_prediction_list(path_list):
    """Open every prediction file in ``path_list`` and combine them.

    Returns ``(data, data_len)`` where ``data`` is ``iter_fn`` applied to
    the list of viewers and ``data_len`` is the sum of their ``data_len``.
    """
    viewers = [EstimatorPredictionViewer(path) for path in path_list]
    total_len = sum(viewer.data_len for viewer in viewers)
    return iter_fn(viewers), total_len
Пример #12
0
def collect_by_order(input_file, feature_data: List[PerspectiveCandidate]):
    """Score candidates by matching predictions to them positionally.

    The i-th prediction entry is paired with the i-th element of
    ``feature_data`` via ``zip`` (so the longer sequence is truncated). The
    score is index 1 of ``softmax(logits)``, stored under the CPID
    "<cid>_<pid>". Both lengths are printed first so a mismatch is visible.
    """
    predictions = EstimatorPredictionViewer(input_file)

    print("prediction : {}".format(predictions.data_len))
    print("feature_data : {}".format(len(feature_data)))

    scores_by_cpid: Dict[CPID, float] = {}
    for pred_entry, candidate in zip(predictions, feature_data):
        positive_prob = softmax(pred_entry.get_vector("logits"))[1]
        cpid_text = "{}_{}".format(candidate.cid, candidate.pid)
        scores_by_cpid[CPID(cpid_text)] = positive_prob

    return scores_by_cpid
Пример #13
0
def pc_eval(pred_path: FilePath, label_path: FilePath, option="avg"):
    """Print thresholded predictions and their accuracy against labels.

    Scores are loaded with ``load_prediction``, reduced per key with
    ``reduce_score(..., option)`` and thresholded at 0.5. Gold labels come
    from ``load_label(label_path)`` and are looked up by the same keys.

    Raises:
        KeyError: if a predicted key has no entry in the label file.
        ZeroDivisionError: if there are no keys to evaluate (assuming
            ``lmap`` returns a list — TODO confirm).
    """
    # NOTE(review): load_prediction receives the viewer object here, not a
    # path; confirm that is what the imported load_prediction expects.
    data = EstimatorPredictionViewer(pred_path)
    raw_predictions: List[Tuple[str, List[float]]] = load_prediction(data)
    keys, reduced_scores = reduce_score(raw_predictions, option)

    labels_d: Dict[str, int] = load_label(label_path)
    label_list: List[int] = lmap(lambda x: labels_d[x], keys)

    pred = lmap(lambda x: int(x > 0.5), reduced_scores)
    print(reduced_scores)
    print(pred)

    num_correct = np.count_nonzero(np.equal(pred, label_list))
    print("Acc : ", num_correct / len(label_list))
Пример #14
0
def load_prediction(pred_path) -> Dict[str, List[Tuple[str, float, Segment]]]:
    """Group ``(key, score, tokens)`` triples of a prediction file by key.

    ``tokens`` are the entry's 'input_ids' tokens, ``key`` is
    ``input_tokens_to_key(tokens)`` and ``score`` is index 1 of
    ``softmax(logits)``.
    """

    def parse_entry(entry) -> Tuple[str, float, Segment]:
        tokens = entry.get_tokens('input_ids')
        probs = softmax(entry.get_vector("logits"))
        return input_tokens_to_key(tokens), probs[1], tokens

    viewer = EstimatorPredictionViewer(pred_path)
    triples: List[Tuple[str, float, Segment]] = lmap(parse_entry, viewer)
    return group_by(triples, lambda x: x[0])
Пример #15
0
def main():
    """Print a table of (label, P[1], correct?) sorted by descending P[1].

    The prediction file is taken from ``sys.argv[1]``. A prediction is 1
    when index 1 of the softmax output exceeds 0.5, and the last column is
    "Y" when that prediction equals the entry's "label_ids".
    """
    table = []
    for e in EstimatorPredictionViewer(sys.argv[1]):
        label_ids = e.get_vector("label_ids")
        probs = softmax(e.get_vector("logits"), -1)

        if probs[1] > 0.5:
            predict_label = 1
        else:
            predict_label = 0

        if predict_label == label_ids:
            decision = "Y"
        else:
            decision = "N"

        table.append([label_ids, probs[1], decision])

    table.sort(key=lambda row: row[1], reverse=True)
    print_table(table)
Пример #16
0
def main():
    """Print a histogram of prediction scores in bins of width 0.01.

    The prediction file is taken from ``sys.argv[1]``. Each score is
    binned under ``str(int(score * 100))``; the set of observed bins is
    printed first, then the count of every non-empty bin from "0" to "100".
    """
    pred_data = EstimatorPredictionViewer(sys.argv[1])

    def bin_fn(score):
        return str(int(score * 100))

    histogram = BinHistogram(bin_fn)
    for e in pred_data:
        histogram.add(logit_to_score_softmax(e.get_vector('logits')))

    counter = histogram.counter
    print(counter.keys())
    for key in map(str, range(101)):
        if key in counter:
            print(key, counter[key])
Пример #17
0
def main():
    """Render token-level gold scores of a prediction file as HTML.

    The prediction file is taken from ``sys.argv[1]``; output goes to the
    ``HtmlVisualizer`` opened on "toke_score_gold.html". Every 10th entry
    is rendered. For each token before the first "[PAD]", the highlight is
    derived from column 0 of "label_ids" reshaped to ``[-1, 2]``; stopwords
    get no highlight and tokens found in the first segment are drawn with
    a fixed green highlight.
    """
    file_path = sys.argv[1]
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("toke_score_gold.html")
    stopwords = load_stopwords_for_query()

    skip = 10
    for entry_idx, entry in enumerate(viewer):
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = np.reshape(entry.get_vector("label_ids"), [-1, 2])
        seg1, _ = split_p_h_with_input_ids(tokens, input_ids)

        # A sequence that fills the whole window has no "[PAD]" token; treat
        # it as entirely content instead of crashing with ValueError.
        # (assumes `tokens` supports len() like a list — TODO confirm)
        try:
            pad_idx = tokens.index("[PAD]")
        except ValueError:
            pad_idx = len(tokens)
        assert pad_idx > 0

        cells = []
        cells2 = []  # second row is intentionally left empty
        for idx in range(pad_idx):
            token = tokens[idx]
            score = label_ids[idx][0]

            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                # First-segment tokens override any score-based highlight.
                highlight_score = 50
                color = "G"

            cells.append(Cell(token,
                              highlight_score=highlight_score,
                              target_color=color))
        html.multirow_print_from_cells_list([cells, cells2])

        # Only reached for rendered entries, so the loop stops at the first
        # multiple of `skip` above 10000.
        if entry_idx > 10000:
            break
Пример #18
0
def aawd_pred_histogram():
    """Print a histogram of prediction scores in bins of width 0.001.

    Reads "ada_aawd5_clue.4000.score" from the "clue_counter_arg" output
    directory. Each score is binned under ``str(int(score * 1000))`` and
    every non-empty bin from "0" to "1000" is printed with its count.
    """
    num_bins = 1000
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = EstimatorPredictionViewer(prediction_file)

    def bin_fn(score):
        return str(int(score * num_bins))

    histogram = BinHistogram(bin_fn)
    for e in pred_data:
        histogram.add(logit_to_score_softmax(e.get_vector('logits')))

    # Iterate over the full key range produced by bin_fn; the previous
    # range(101) only showed bins for scores below roughly 0.101.
    # (assumes scores lie in [0, 1] — TODO confirm)
    for i in range(num_bins + 1):
        key = str(i)
        if key in histogram.counter:
            print(key, histogram.counter[key])
Пример #19
0
def collect_scores_and_confidence(prediction_file, info: Dict, logit_to_score) \
        -> Dict[DataID, Tuple[CPIDPair, float, float]]:
    """Map each data_id to ``(CPIDPair, score, confidence)``.

    ``score`` is ``logit_to_score(logits)`` and ``confidence`` comes from
    ``get_confidence_or_rel_score(entry)``. Entries whose info record (or
    its 'cid' / 'pid' key) is missing are reported on stdout and skipped.
    """
    viewer = EstimatorPredictionViewer(prediction_file)
    print("Num data ", viewer.data_len)

    result: Dict[DataID, Tuple[CPIDPair, float, float]] = {}
    for entry in viewer:
        score = logit_to_score(entry.get_vector("logits"))
        data_id = entry.get_vector("data_id")[0]
        confidence = get_confidence_or_rel_score(entry)
        try:
            record = info[str(data_id)]
            cpid = CPIDPair((record['cid'], record['pid']))
            result[data_id] = (cpid, score, confidence)
        except KeyError:
            print("Key error")
            print("data_id", data_id)
    return result
Пример #20
0
def get_cpid_score(pred_path: FilePath,
                   cpid_resolute_d: Dict[str, CPID],
                   option="avg") -> Dict[CPID, float]:
    """Return the reduced score of every key that resolves to a CPID.

    Scores are loaded with ``load_prediction``, reduced per key with
    ``reduce_score(..., option)``, and each key is translated through
    ``cpid_resolute_d``. Keys missing from that mapping are reported on
    stdout and dropped together with their score.
    """
    data = EstimatorPredictionViewer(pred_path)
    raw_predictions: List[Tuple[str, List[float]]] = load_prediction(data)
    keys, reduced_scores = reduce_score(raw_predictions, option)

    cpid_scores: Dict[CPID, float] = {}
    n_not_found = 0
    # Walk keys and scores together: skipping only the key (as the previous
    # code did) shifts every later cpid onto the wrong score.
    for key, score in zip(keys, reduced_scores):
        try:
            cpid = cpid_resolute_d[key]
        except KeyError:
            print("not found", key)
            n_not_found += 1
            continue
        cpid_scores[cpid] = score

    if n_not_found:
        print("{} missing from text -> cpid resolution".format(n_not_found))
    return cpid_scores
Пример #21
0
def show():
    """Print ROC-AUC of three scorers against the gold labels.

    The scorers are, in print order: index 2 of the softmax output of
    "pair_eval_1", a uniform random baseline, and the pickled predictions
    "cont_model_0".
    """
    prediction_path = os.path.join(output_path, "pair_eval_1")
    viewer = EstimatorPredictionViewer(prediction_path)

    pred0 = load_from_pickle("cont_model_0")

    labels = get_label_as_or()
    class2_probs = []
    random_scores = []
    for entry in viewer:
        probs = softmax(entry.get_vector("logits"))
        class2_probs.append(probs[2])
        random_scores.append(random.random())

    # NOTE(review): the previous code also built a `t == 2` binarization of
    # `labels` but never used it; confirm `labels` is already binary.
    print(roc_auc_score(labels, class2_probs))
    print(roc_auc_score(labels, random_scores))
    print(roc_auc_score(labels, pred0))
Пример #22
0
def collect_info(prediction_file, info: Dict, logit_to_score) -> Dict[CPIDPair, List[Tuple[float, float, Dict]]]:
    """Group ``(score, rel_score, info record)`` triples by CPIDPair.

    ``score`` is ``logit_to_score(logits)`` and ``rel_score`` is the first
    element of the entry's "rel_score" vector. Entries whose info record
    (or its 'cid' / 'pid' key) is missing are reported on stdout and
    skipped.
    """
    viewer = EstimatorPredictionViewer(prediction_file)
    print("Num data ", viewer.data_len)

    grouped: Dict[CPIDPair, List[Tuple[float, float, Dict]]] = defaultdict(list)
    for entry in viewer:
        score = logit_to_score(entry.get_vector("logits"))
        rel_score = entry.get_vector("rel_score")[0]
        data_id = entry.get_vector("data_id")[0]
        try:
            record = info[str(data_id)]
            cpid = CPIDPair((record['cid'], record['pid']))
            grouped[cpid].append((score, rel_score, record))
        except KeyError:
            print("Key error")
            print("data_id", data_id)
    return grouped
Пример #23
0
def collect_pipeline2_score(prediction_path,
                            pc_rel_info) -> Dict[CPID, List[float]]:
    """Collect, per CPID, the list of index-1 softmax scores.

    ``pc_rel_info`` names the pickle holding the info dict. The CPID of an
    entry is "<cid>_<pid>", where 'cid' and 'pid' are read from two
    adjacent info records: (data_id - 1, data_id) is tried first and
    (data_id, data_id + 1) is used if that raises a KeyError.
    """
    info_d = load_from_pickle(pc_rel_info)
    print('info_d', len(info_d))

    def get_cpid(data_id, info_d) -> CPID:
        try:
            cid_info = info_d[data_id - 1]
            pid_info = info_d[data_id]
            cid = cid_info['cid']
            pid = pid_info['pid']
        except KeyError:
            # This data_id is the first of its pair rather than the second.
            cid_info = info_d[data_id]
            pid_info = info_d[data_id + 1]
            cid = cid_info['cid']
            pid = pid_info['pid']
        return CPID("{}_{}".format(cid, pid))

    viewer = EstimatorPredictionViewer(prediction_path)

    print("Num data ", viewer.data_len)
    ticker = TimeEstimator(viewer.data_len)
    scores_by_cpid: Dict[CPID, List] = {}
    for entry in viewer:
        ticker.tick()
        score = softmax(entry.get_vector("logits"))[1]
        data_id = entry.get_vector("data_id")[0]

        cpid: CPID = get_cpid(data_id, info_d)
        scores_by_cpid.setdefault(cpid, []).append(score)

    return scores_by_cpid
Пример #24
0
def main(pred_path, info_path, output_path):
    """Convert the predictions at ``pred_path`` into a ranked list.

    The info object is unpickled from ``info_path``; the actual work is
    delegated to ``prediction_to_ranked_list``.
    """
    viewer = EstimatorPredictionViewer(pred_path)
    prediction_to_ranked_list(viewer,
                              load_from_pickle(info_path),
                              output_path)
Пример #25
0
def dir_itr(dir_path):
    """Yield every prediction entry of every file from ``get_dir_files``."""
    for file_path in get_dir_files(dir_path):
        yield from EstimatorPredictionViewer(file_path)
Пример #26
0
def load_preditions(path):
    """Yield the "logits" vector of each entry in the prediction file."""
    for entry in EstimatorPredictionViewer(path):
        yield entry.get_vector("logits")
Пример #27
0
def load_preditions(path):
    """Yield ``(input_ids, logits)`` for each entry in the prediction file."""
    for entry in EstimatorPredictionViewer(path):
        yield entry.get_vector("input_ids"), entry.get_vector("logits")