def save_dev_scores(model, val, val_x):
    """Score the validation features with ``model`` and pickle the result.

    Args:
        model: Object exposing ``predict_proba``; it is called once on
            ``val_x``.
        val: Iterable of feature objects, each with a ``claim_pers``
            attribute carrying ``cid`` and ``pid``. It is walked in lockstep
            with the rows returned by ``predict_proba``.
        val_x: Input handed to ``model.predict_proba`` unchanged.

    The element at index 1 of every prediction row is stored under the
    ``"<cid>_<pid>"`` key (presumably the positive-class probability --
    confirm against the model's class order). The resulting mapping is
    written with ``save_to_pickle`` under the name ``"pc_ngram_logits"``.
    """
    val_probs = model.predict_proba(val_x)
    score_d = {
        CPID("{}_{}".format(feature.claim_pers.cid, feature.claim_pers.pid)):
            class_probs[1]
        for feature, class_probs in zip(val, val_probs)
    }
    save_to_pickle(score_d, "pc_ngram_logits")
def prediction_to_dict(
        prediction: List[Tuple[str, List[Dict]]]) -> Dict[CPID, float]:
    """Flatten per-claim prediction lists into a ``CPID -> score`` mapping.

    Args:
        prediction: Sequence of ``(claim_id, preds)`` pairs, where every
            item of ``preds`` is a dict providing ``'pid'`` and ``'score'``.

    Returns:
        A dict keyed by ``"<claim_id>_<pid>"`` whose values are the scores
        converted with ``float``. If the same key occurs more than once,
        the last occurrence wins.
    """
    return {
        CPID("{}_{}".format(claim_id, entry['pid'])): float(entry['score'])
        for claim_id, entries in prediction
        for entry in entries
    }
def get_cpids_and_token_keys(
        tokenizer: FullTokenizer,
        claim_entry: ParagraphClaimPersFeature) -> Tuple[str, CPID]:
    """Build the token-based lookup key and the CPID for one entry.

    Args:
        tokenizer: Tokenizer whose ``tokenize`` method is applied to the
            claim text and then to the perspective text.
        claim_entry: Feature object whose ``claim_pers`` attribute supplies
            ``claim_text``, ``p_text``, ``cid`` and ``pid``.

    Returns:
        ``(key, cpid)`` where ``key`` is the space-joined claim tokens, an
        underscore, and the space-joined perspective tokens, and ``cpid``
        is ``"<cid>_<pid>"`` wrapped in ``CPID``.
    """
    pair = claim_entry.claim_pers

    # Tokenise the claim first, then the perspective.
    claim_tokens = tokenizer.tokenize(pair.claim_text)
    p_tokens = tokenizer.tokenize(pair.p_text)

    key = "_".join((" ".join(claim_tokens), " ".join(p_tokens)))
    cpid: CPID = CPID("{}_{}".format(pair.cid, pair.pid))
    return key, cpid
def get_cpid(data_id, info_d) -> CPID:
    """Resolve the CPID for ``data_id`` from two neighbouring info entries.

    The claim id is read from one entry of ``info_d`` and the perspective
    id from the entry that follows it. The pair ``(data_id - 1, data_id)``
    is tried first; if any lookup in that attempt raises ``KeyError``
    (a missing entry or a missing ``'cid'``/``'pid'`` field), the pair
    ``(data_id, data_id + 1)`` is used instead.

    Args:
        data_id: Index into ``info_d``; must support ``- 1`` and ``+ 1``.
        info_d: Mapping from data id to a dict that holds ``'cid'`` and/or
            ``'pid'``.

    Returns:
        ``"<cid>_<pid>"`` wrapped in ``CPID``.

    Raises:
        KeyError: If the fallback pair cannot be resolved either.
    """

    def read_pair(cid_key, pid_key):
        # Fetch both entries before reading their fields.
        cid_entry = info_d[cid_key]
        pid_entry = info_d[pid_key]
        return cid_entry['cid'], pid_entry['pid']

    try:
        cid, pid = read_pair(data_id - 1, data_id)
    except KeyError:
        cid, pid = read_pair(data_id, data_id + 1)
    return CPID("{}_{}".format(cid, pid))
def collect_by_order(input_file, feature_data: List[PerspectiveCandidate]):
    """Pair predictions with candidates by position and collect scores.

    Args:
        input_file: Passed to ``EstimatorPredictionViewer`` to load the
            predictions.
        feature_data: Candidates, each exposing ``cid`` and ``pid``; the
            i-th candidate is matched with the i-th prediction.

    Returns:
        Dict mapping ``"<cid>_<pid>"`` (as ``CPID``) to element 1 of the
        softmax over the prediction's ``"logits"`` vector.

    Both lengths are printed so a mismatch can be noticed; ``zip`` itself
    stops silently at the shorter of the two inputs.
    """
    predictions = EstimatorPredictionViewer(input_file)
    print("prediction : {}".format(predictions.data_len))
    print("feature_data : {}".format(len(feature_data)))

    score_d: Dict[CPID, float] = {}
    for entry, candidate in zip(predictions, feature_data):
        positive_prob = softmax(entry.get_vector("logits"))[1]
        key = CPID("{}_{}".format(candidate.cid, candidate.pid))
        score_d[key] = positive_prob
    return score_d
def get_predictions( claim_and_candidate: Tuple[Dict, List[Dict]]) -> Tuple[str, List[Dict]]: claim_info, candidates = claim_and_candidate nonlocal dp_not_found for candi in candidates: cid = candi['cid'] pid = candi['pid'] cpid = CPID("{}_{}".format(cid, pid)) if cpid in score_d: candi['new_score'] = score_d[cpid] else: dp_not_found += 1 candi['new_score'] = 0.01 candi['final_score'] = candi['new_score'] + candi['score'] / 100 candi[ 'rationale'] = "final_score={} cls_score={} lucene_score={}".format( candi['final_score'], candi['new_score'], candi['score']) candidates.sort(key=lambda c: c['final_score'], reverse=True) return claim_info['cId'], candidates[:top_k]
def CPID_to_CPIDPair(cpid: CPID) -> CPIDPair:
    """Convert a ``"<cid>_<pid>"`` string into a pair of integers.

    Args:
        cpid: String made of exactly two ``"_"``-separated integer fields.

    Returns:
        ``CPIDPair((cid, pid))`` with both parts converted by ``int``.

    Raises:
        ValueError: If ``cpid`` does not split into exactly two fields, or
            if a field is not a valid integer literal.
    """
    fields = cpid.split("_")
    # Unpack before converting, so a wrong field count fails first.
    cid_text, pid_text = fields
    return CPIDPair((int(cid_text), int(pid_text)))