def make_distances(matches):
    """Return the matches with additional distance columns."""
    df = matches.copy()
    df[DISTANCE_GEOMETRIC] = df.apply(lambda x: sd.euclidean(
        [x[MAN_START], x[MAN_END]], [x[AUTO_START], x[AUTO_END]]), axis=1)
    df[DISTANCE_EDIT] = df.apply(
        lambda x: distance.edit_distance(x[MAN_TEXT], x[AUTO_TEXT]), axis=1)
    df[DISTANCE_HAMMING] = df.apply(
        lambda x: segmentation.ghd(*to_segmentation_metric_form(
            [x[MAN_START], x[MAN_END]], [x[AUTO_START], x[AUTO_END]])), axis=1)
    # windowdiff k will be 1/2 the average segment length
    df[DISTANCE_Windowdiff] = df.apply(
        lambda x: segmentation.windowdiff(*to_segmentation_metric_form(
            [x[MAN_START], x[MAN_END]], [x[AUTO_START], x[AUTO_END]],
            return_window_size=True)), axis=1)
    df[DISTANCE_PK] = df.apply(
        lambda x: segmentation.pk(*to_segmentation_metric_form(
            [x[MAN_START], x[MAN_END]], [x[AUTO_START], x[AUTO_END]])), axis=1)
    return df
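# The helper to_segmentation_metric_form is not shown in this snippet; the
# sketch below is a guess at its contract (hypothetical, assumed behavior,
# not the source's implementation): turn two (start, end) boundary spans into
# the "0"/"1" strings NLTK's segmentation metrics expect, optionally also
# returning a window size of half the mean segment length.
def to_segmentation_metric_form(ref_span, hyp_span, return_window_size=False):
    length = max(ref_span[1], hyp_span[1]) + 1
    ref = "".join("1" if i in ref_span else "0" for i in range(length))
    hyp = "".join("1" if i in hyp_span else "0" for i in range(length))
    if return_window_size:
        k = max(1, int(round(length / (ref.count("1") * 2.0))))
        return ref, hyp, k
    return ref, hyp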
def get_standard_metrics(gt, pred, msn):
    # Render gold and predicted boundary index sets as 0/1 strings over msn
    # sentences, then score with Pk and WindowDiff.
    gt_segs = ''.join(['1' if i in gt else '0' for i in range(msn)])
    pred_segs = ''.join(['1' if i in pred else '0' for i in range(msn)])
    # Window size: half the average gold segment length, further quartered
    # (the extra // 4 is as in the original).
    k_val = int(round(len(gt_segs) / (gt_segs.count('1') * 2.0)))
    k_val = k_val // 4
    return seg.pk(gt_segs, pred_segs, k=k_val), seg.windowdiff(gt_segs, pred_segs, k=k_val)
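# Hedged usage sketch (hypothetical data, not from the source): gt and pred
# are sets of boundary sentence indexes, msn is the sentence count. With the
# quartered window above, msn must be large enough that k_val stays >= 1.
# import nltk.metrics.segmentation as seg  # assumed binding for `seg`
gt = {9, 24}
pred = {9, 27}
msn = 40
# pk_val, wd_val = get_standard_metrics(gt, pred, msn)  # k_val == 2 here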
def compute_segmentation_scores(reference, results, k):
    """Compute WindowDiff, Beeferman's Pk, and Generalized Hamming Distance."""
    # Note: dividing by len(reference) assumes an older NLTK in which
    # windowdiff returned a raw count; current NLTK returns a proportion.
    window_diff = float(windowdiff(reference, results, k, boundary="T")) / len(reference)
    bpk = pk(reference, results, boundary="T")
    generalized_hamming_distance = ghd(reference, results, boundary="T") / len(reference)
    return window_diff, bpk, generalized_hamming_distance
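# Hedged sketch (assumption about NLTK versions, not from the source): with
# current NLTK, windowdiff and pk already return proportions in [0, 1], so no
# extra division by len(reference) is needed.
from nltk.metrics.segmentation import ghd, pk, windowdiff

ref = "T....T...T....."  # hypothetical boundary strings, "T" marks a boundary
hyp = "T...T.....T...."
print(windowdiff(ref, hyp, 3, boundary="T"))  # already a fraction of windows
print(pk(ref, hyp, boundary="T"))  # k defaults to half mean segment length
print(ghd(ref, hyp, boundary="T"))  # raw edit cost; normalize to compare texts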
def get_seg_scores(y_test, pred_class, evalk=3):
    targetstr = "".join([str(int(x)) for x in y_test])
    predstr = "".join([str(int(x)) for x in pred_class])
    logger.debug(targetstr[0:50])
    logger.debug(predstr[0:50])
    wd = windowdiff(targetstr, predstr, k=evalk)
    pkval = pk(targetstr, predstr, k=evalk)
    print("PK: %f" % pkval)
    print("WD: %f" % wd)
    return pkval, wd
def get_pk_wd(x, k=3, conv=None):
    # logger.debug("evalk %f: " % evalk)
    if conv is None:
        conv = x["conv"].iloc[0]
    # Pad both boundary strings with k zeros on each side so the metric
    # windows at the sequence edges are well defined.
    targets = "0" * k + "".join(x["target"].astype(str)) + "0" * k
    preds = "0" * k + "".join(x["pred"].astype(str)) + "0" * k
    wd = windowdiff(targets, preds, k=k)
    pkval = pk(targets, preds, k=k)
    return pd.DataFrame({"conv": conv, "PK": pkval, "WD": wd}, index=[conv])
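# Hedged usage sketch (hypothetical DataFrame, not from the source): get_pk_wd
# expects per-utterance 0/1 columns "target" and "pred" plus a conversation id
# column "conv", e.g. applied per conversation via groupby.
import pandas as pd

df = pd.DataFrame({
    "conv": ["c1"] * 8,
    "target": [0, 0, 1, 0, 0, 0, 1, 0],
    "pred":   [0, 1, 0, 0, 0, 0, 1, 0],
})
# scores = df.groupby("conv", group_keys=False).apply(get_pk_wd, k=3)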
def score(predicts, labels, windowsize=2, type=1):
    '''
    sample_num * conversation_length list of numpy arrays
    :param predicts: per-conversation arrays of predicted 0/1 boundary labels
    :param labels: per-conversation arrays of gold 0/1 boundary labels
    :param type: 1 -- original dataset, the windowsize is appropriate;
                 0 -- augmented dataset, the windowsize may be wrong
    :return: windowdiff, pk, F1-macro, acc
    '''
    # note: `type` shadows the builtin; kept for API compatibility
    acc = 0
    f1_macro = 0
    f1_micro = 0
    windiff = 0
    pkk = 0
    for i in range(len(predicts)):
        predict_str = ''.join(str(x) for x in list(predicts[i]))
        label_str = ''.join(str(x) for x in list(labels[i]))
        acc += np.sum(np.equal(predicts[i], labels[i])) / len(predicts[i])
        f1_macro += f1_score(labels[i], predicts[i], average='macro')
        f1_micro += f1_score(labels[i], predicts[i], average='micro')
        if type:
            windiff += windowdiff(label_str, predict_str, windowsize)
            pkk += pk(label_str, predict_str, windowsize)
    acc = acc / len(predicts)
    f1_macro = f1_macro / len(predicts)
    f1_micro = f1_micro / len(predicts)
    if type:
        windiff = windiff / len(predicts)
        pkk = pkk / len(predicts)
    score = {
        "windowdiff": windiff,
        "pk": pkk,
        "F1-macro": f1_macro,
        "acc": acc
    }
    return score
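# Hedged usage sketch (hypothetical data): one 0/1 numpy array of boundary
# labels per conversation, scored with the function above.
import numpy as np

labels = [np.array([0, 0, 1, 0, 0, 1]), np.array([0, 1, 0, 0, 0, 0])]
predicts = [np.array([0, 1, 0, 0, 0, 1]), np.array([0, 1, 0, 0, 1, 0])]
# print(score(predicts, labels, windowsize=2, type=1))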
def evaluate(gold_idx, pred_idx, k):
    """
    gold_idx: gold-standard segmentation, as a list of lists of indexes
    pred_idx: predicted segmentation of the text, in the same format
    k: window size (preferably half the document length divided by the
       number of gold segments)
    return: pk (Beeferman, Berger & Lafferty, 1999) and windowdiff
            (Pevzner & Hearst, 2002) metrics for the prediction
            (lower is better)
    """
    # Mark the last index of each segment as a boundary ("1"), all others "0".
    gold_idx = [["0" for i in j] for j in gold_idx]
    gold = []
    for i in gold_idx:
        i[-1] = "1"
        gold.extend(i)
    gold = "".join(gold)
    pred_idx = [["0" for i in j] for j in pred_idx]
    pred = []
    for i in pred_idx:
        i[-1] = "1"
        pred.extend(i)
    pred = "".join(pred)
    return {'pk': pk(gold, pred, k), 'windowdiff': windowdiff(gold, pred, k)}
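# Hedged usage sketch (hypothetical data): each inner list holds the token
# indexes of one segment; evaluate() marks the last index of every segment as
# a boundary. Half the mean gold segment length is the usual window size.
gold_idx = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]
pred_idx = [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10]]
k = max(1, round(sum(len(s) for s in gold_idx) / len(gold_idx) / 2))
# print(evaluate(gold_idx, pred_idx, k))  # {'pk': ..., 'windowdiff': ...}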
def evaluate_segmentation(bc3=False, limit=0):
    g = data_to_string(WAPITI_GOLD_FILE, limit=limit)    # gold string
    r = data_to_string(WAPITI_RESULT_FILE, limit=limit)  # result string
    if bc3:
        t = data_to_string(BC3_TEXT_TILING_FILE, limit=limit,
                           label_position=0)  # text tiling baseline string
    else:
        t = data_to_string(WAPITI_GOLD_FILE, limit=limit, label_position=-2)
    avg = float(len(g)) / (g.count("T") + 1)  # average segment size
    k = int(avg / 2)                          # window size for WindowDiff
    b = ("T" + (int(math.floor(avg)) - 1) * ".") * int(
        math.ceil(float(len(g)) / int(math.floor(avg))))
    b = b[:len(g)]                            # baseline string
    print(g[:150])
    print(r[:150])
    # WindowDiff
    wdi = (float(windowdiff(g, r, k, boundary="T")) / len(g)) * 100
    # Beeferman's Pk
    bpk = (pk(g, r, boundary="T")) * 100
    # Generalized Hamming Distance
    ghd = (GHD(g, r, boundary="T") / len(g)) * 100
    # accuracy
    acc = accuracy(list(g), list(r)) * 100
    # precision, recall, f-measure
    pre = metrics.precision_score(list(g), list(r)) * 100
    rec = metrics.recall_score(list(g), list(r)) * 100
    f_1 = (2.0 * (rec * pre)) / (rec + pre)
    return acc, pre, rec, f_1, wdi, bpk, ghd, g.count("T"), r.count("T")
def evaluate_segmentation(bc3=False, limit=-1):
    d = "".join(data_to_list(WAPITI_TRAIN_FILE))              # training data
    g = "".join(data_to_list(WAPITI_GOLD_FILE, limit=limit))  # gold string
    temp_r = data_to_list(WAPITI_RESULT_FILE, limit=limit)    # result string
    # n = data_to_list("var/union/ngrams_" + WAPITI_RESULT_FILE[-1], limit=limit)
    # scores = {}
    r = ""
    for i, col in enumerate(temp_r):
        # score = 0
        # if n[i][:n[i].index("/")] == "T":
        #     score = 1
        # elif col[:col.index("/")] == "T":
        #     score = float(col[col.index("/") + 1:])
        # scores[i] = score
        r += col[:col.index("/")]
    # sorted_indexes = sorted(scores, key=scores.get, reverse=True)
    # indexes = [index for index, score in scores.iteritems() if score > 0.99]
    # r = "." * len(g)
    # n_boundaries = int((float(g.count("T")) / len(g)) * len(g))
    # for i, index in enumerate(sorted_indexes):
    #     r = r[:index] + "T" + r[index + 1:]
    #     if i == n_boundaries:
    #         break
    # for index in indexes:
    #     r = r[:index] + "T" + r[index + 1:]
    if bc3:
        t = data_to_list(BC3_TEXT_TILING_FILE, limit=limit,
                         label_position=0)  # text tiling baseline string
    else:
        t = data_to_list(WAPITI_GOLD_FILE, limit=limit, label_position=-2)
    avg_g = float(len(g)) / (g.count("T") + 1)  # average segment size (reference)
    avg_d = float(len(d)) / (d.count("T") + 1)  # average segment size (training)
    k = int(avg_g / 2)                          # window size for WindowDiff
    b = ("T" + (int(math.floor(avg_d)) - 1) * ".") * int(
        math.ceil(float(len(d)) / int(math.floor(avg_d))))
    b = b[:len(g)]                              # baseline string
    # WindowDiff
    wdi_rs = (float(windowdiff(g, r, k, boundary="T")) / len(g)) * 100
    wdi_bl = (float(windowdiff(g, b, k, boundary="T")) / len(g)) * 100
    wdi_tt = (float(windowdiff(g, t, k, boundary="T")) / len(g)) * 100
    # Beeferman's Pk
    bpk_rs = (pk(g, r, boundary="T")) * 100
    bpk_bl = (pk(g, b, boundary="T")) * 100
    bpk_tt = (pk(g, t, boundary="T")) * 100
    # Generalized Hamming Distance
    ghd_rs = (ghd(g, r, boundary="T") / len(g)) * 100
    ghd_bl = (ghd(g, b, boundary="T") / len(g)) * 100
    ghd_tt = (ghd(g, t, boundary="T") / len(g)) * 100
    # accuracy
    acc_rs = accuracy(list(g), list(r)) * 100
    acc_bl = accuracy(list(g), list(b)) * 100
    acc_tt = accuracy(list(g), list(t)) * 100
    # precision, recall, f-measure
    pre_rs = metrics.precision_score(list(g), list(r), pos_label="T") * 100
    rec_rs = metrics.recall_score(list(g), list(r), pos_label="T") * 100
    f_1_rs = (2.0 * (rec_rs * pre_rs)) / (rec_rs + pre_rs)
    pre_bl = metrics.precision_score(list(g), list(b), pos_label="T") * 100
    rec_bl = metrics.recall_score(list(g), list(b), pos_label="T") * 100
    f_1_bl = (2.0 * (rec_bl * pre_bl)) / (rec_bl + pre_bl)
    pre_tt = metrics.precision_score(list(g), list(t), pos_label="T") * 100
    rec_tt = metrics.recall_score(list(g), list(t), pos_label="T") * 100
    f_1_tt = (2.0 * (rec_tt * pre_tt)) / (rec_tt + pre_tt)
    return (acc_rs, acc_bl, acc_tt, pre_rs, pre_bl, pre_tt,
            rec_rs, rec_bl, rec_tt, f_1_rs, f_1_bl, f_1_tt,
            wdi_rs, wdi_bl, wdi_tt, bpk_rs, bpk_bl, bpk_tt,
            ghd_rs, ghd_bl, ghd_tt,
            g.count("T"), b.count("T"), r.count("T"), t.count("T"))
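# Hedged sketch of the baseline construction above (assumption about intent,
# not the source's code): a degenerate segmenter that places a boundary every
# floor(avg) positions, producing a periodic "T...T..." string truncated to
# the gold length.
import math

def periodic_baseline(avg_seg_len, total_len):
    period = int(math.floor(avg_seg_len))
    unit = "T" + "." * (period - 1)
    return (unit * int(math.ceil(total_len / period)))[:total_len]

# periodic_baseline(4.2, 10) -> 'T...T...T.'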
    # print(i, sline, prevdoc, currdoc)
    if prevdoc is not None:
        doclens.append(currlen)
        prevdoc = currdoc
        currlen = 1
    else:
        currlen += 1
    # if i > 400:
    #     break

# print(targets)
# print(preds)
# print(doclens)
logger.debug("ndocs: %d" % len(doclens))
# Window size: half the average document length.
evalk = int(round(numpy.average(doclens) / 2))
logger.debug("evalk %f: " % evalk)
wd = windowdiff(targets, preds, k=evalk)
# logger.debug("WD: %f" % wd)
pkval = pk(targets, preds, k=evalk)
# logger.debug("PK: %f" % pkval)
fstem = os.path.basename(options.input)
with open(options.outfile, "w") as f:
    f.write(fstem + "\tPK\t" + str(pkval) + "\n")
    f.write(fstem + "\tWD\t" + str(wd) + "\n")
print("PK: %f" % pkval)
print("WD: %f" % wd)
if len(tt.startids) > 1:
    curr_doc_sizes = numpy.array(tt.startids[1:]) - numpy.array(tt.startids[:-1])
    # evalk = int(round(numpy.average(curr_doc_sizes) / 2))
    evalk = 3
    logger.debug("eval k: %d" % evalk)
    # windowdiff/pk raise ValueError when the window is larger than the
    # sequence, so guard both calls.
    try:
        wd = windowdiff(goldstr, predstr, k=evalk)
        logger.info("WD: %f", wd)
        wds.append(wd)
    except ValueError as e:
        logger.error("windowdiff value error")
        logger.error(e)
    try:
        pkval = pk(goldstr, predstr, k=evalk)
        logger.info("PK: %f", pkval)
        pks.append(pkval)
    except ValueError as e:
        logger.error("pkval value error")
        logger.error(e)
    pseq = pseq + predstr
    gseq = gseq + goldstr
    docsizes = numpy.hstack((docsizes, curr_doc_sizes))
else:
    logger.info("only one paragraph: " + fstem)
if 'convid' in tt.meta[0].keys():
    print(tt.meta[0]['convid'])