from bs4 import BeautifulSoup
import segeval


def get_similarity(text_1, text_2):
    # text_1 and text_2 are XML data that use spans to separate boundaries,
    # e.g. BOSTON, MA ... <span class="highlighted" id="634541">Steven L.
    # Davis pled guilty yesterday to federal charges that he stole and
    # disclosed trade secrets of The Gillette Company</span>.
    if text_1 == '' or text_2 == '':
        return 'Error Text Input Is Empty'

    xml_soup_1 = BeautifulSoup(text_1, 'html.parser')
    xml_soup_2 = BeautifulSoup(text_2, 'html.parser')
    xml_soup_1 = remove_html_tags(xml_soup_1)
    xml_soup_2 = remove_html_tags(xml_soup_2)
    segements_1 = get_segements(xml_soup_1)
    segements_2 = get_segements(xml_soup_2)

    # Both inputs must segment the same underlying text
    seg_check = check_segment_length(segements_1, segements_2)
    if not seg_check:
        return 'Error Source Text Was Different'

    masses_1 = segeval.convert_positions_to_masses(segements_1)
    masses_2 = segeval.convert_positions_to_masses(segements_2)

    # Segmentation similarity (S) is already a similarity in [0, 1]
    ss = float(segeval.segmentation_similarity(masses_1, masses_2))
    # Pk and WindowDiff are error rates, so invert them into similarities
    pk = 1 - float(segeval.pk(masses_1, masses_2))
    win_diff = 1 - float(segeval.window_diff(masses_1, masses_2))
    return ss, pk, win_diff
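# A hypothetical usage sketch (inputs invented; remove_html_tags,
# get_segements, and check_segment_length are assumed to be defined
# alongside get_similarity). Two annotations of the same text with
# different span boundaries are compared; all three returned values are
# similarities in [0, 1], where 1 means identical boundaries.
text_a = '<span id="1">One two three.</span> <span id="2">Four five.</span>'
text_b = '<span id="1">One two.</span> <span id="2">Three four five.</span>'
ss, pk_sim, wd_sim = get_similarity(text_a, text_b)
print(f"S={ss:.3f}  1-Pk={pk_sim:.3f}  1-WindowDiff={wd_sim:.3f}")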
import segeval


def __initialization(reference, hypothesis):
    if len(reference) != len(hypothesis):
        raise SystemExit(
            "Error! The length of the hypothesis doesn't match the length of the reference!")
    # Convert the reference and hypothesis position sequences into the
    # mass format that SegEval expects
    reference_boundary = segeval.convert_positions_to_masses(reference)
    hypothesis_boundary = segeval.convert_positions_to_masses(hypothesis)
    return reference_boundary, hypothesis_boundary
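# Minimal sketch of calling the helper above (position vectors invented
# for illustration; assumes __initialization is defined at module level):
reference = [1, 1, 1, 2, 2, 3]
hypothesis = [1, 1, 2, 2, 2, 3]
ref_masses, hyp_masses = __initialization(reference, hypothesis)
print(ref_masses, hyp_masses)  # (3, 2, 1) (2, 3, 1)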
import random as rdm

from segeval import convert_positions_to_masses, pk, window_diff


def seg_eval(algo_group_vec, real_group_vec, rev=True):
    """
    Compute the Pk and WindowDiff values for two segmentations, along
    with random baselines.

    :param algo_group_vec: the algorithm result, as token group memberships
    :type algo_group_vec: Union[list, numpy.ndarray]
    :param real_group_vec: the real group memberships of tokens
    :type real_group_vec: Union[list, numpy.ndarray]
    :param rev: if True, treat the real segmentation as the reference
    :type rev: bool
    :return: Pk value, WindowDiff value, random Pk value, random WindowDiff value
    :rtype: (float, float, float, float)
    """
    # Transform position vectors into segmentation (mass) vectors
    real_segm_vec = convert_positions_to_masses(real_group_vec)
    algo_segm_vec = convert_positions_to_masses(algo_group_vec)

    # Shuffle a copy of the real group vector for the random baseline
    rdm_group_vec = real_group_vec.copy()
    rdm.shuffle(rdm_group_vec)
    rdm_segm_vec = convert_positions_to_masses(rdm_group_vec)

    if rev:
        # Real segmentation as the reference
        pk_res = pk(real_segm_vec, algo_segm_vec)
        try:
            win_diff = window_diff(real_segm_vec, algo_segm_vec)
        except Exception:
            win_diff = 1
        # Random baseline
        pk_rdm = pk(real_segm_vec, rdm_segm_vec)
        try:
            win_diff_rdm = window_diff(real_segm_vec, rdm_segm_vec)
        except Exception:
            win_diff_rdm = 1
    else:
        # Algorithm segmentation as the reference
        pk_res = pk(algo_segm_vec, real_segm_vec)
        try:
            win_diff = window_diff(algo_segm_vec, real_segm_vec)
        except Exception:
            win_diff = 1
        # Random baseline
        pk_rdm = pk(rdm_segm_vec, real_segm_vec)
        try:
            win_diff_rdm = window_diff(rdm_segm_vec, real_segm_vec)
        except Exception:
            win_diff_rdm = 1

    return pk_res, win_diff, pk_rdm, win_diff_rdm
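# Usage sketch for seg_eval with invented group vectors: 12 tokens in
# 3 groups. Pk and WindowDiff are error rates (lower is better), and a
# useful segmenter should score well below its random baseline.
real_groups = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
algo_groups = [1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3]
pk_res, win_diff, pk_rdm, win_diff_rdm = seg_eval(algo_groups, real_groups)
print(f"Pk={pk_res} (random baseline: {pk_rdm})")
print(f"WindowDiff={win_diff} (random baseline: {win_diff_rdm})")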
def test_convert_positions_to_masses(self):
    '''
    Test convert_positions_to_masses.
    '''
    self.assertEqual((4, 2),
                     convert_positions_to_masses([1, 1, 1, 1, 2, 2]))
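# For reference, the conversion under test maps a position vector (one
# segment label per unit) to segment masses (run lengths), and
# convert_masses_to_positions inverts it:
import segeval

print(segeval.convert_positions_to_masses([1, 1, 1, 1, 2, 2]))  # (4, 2)
print(segeval.convert_masses_to_positions((4, 2)))  # (1, 1, 1, 1, 2, 2)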
import os

import numpy as np
import segeval

ground_truth_file_list = [
    "mix_word1_groups.txt", "mix_word5_groups.txt",
    "mix_sent1_groups.txt", "mix_sent5_groups.txt",
    "61320_199211_pp_groups.txt", "61320_200411_pp_groups.txt",
    "61320_201211_pp_groups.txt", "61320_201611_pp_groups.txt",
    "61620_200411_pp_groups.txt", "61620_200811_pp_groups.txt",
    "61620_201211_pp_groups.txt", "61620_201611_pp_groups.txt"
]

for ground_truth_file in ground_truth_file_list:
    # Get the base path (the script must run from a folder inside "SemSim_AutoCor")
    working_path = os.getcwd()
    base_path = str.split(working_path, "SemSim_AutoCor")[0] + "SemSim_AutoCor/"
    # Path of the raw text file
    ground_truth_path = f"{base_path}corpora/{ground_truth_file}"

    # Load the ground truth group vector
    with open(ground_truth_path) as ground_truth:
        real_group_vec = ground_truth.read()
    real_group_vec = np.array(
        [int(element) for element in real_group_vec.split(",")])

    n_group = max(real_group_vec) + 1
    real_segm_vec = segeval.convert_positions_to_masses(real_group_vec)
    print(f"Groupfile {ground_truth_file} has {n_group} groups "
          f"and a mean segment length of {np.mean(real_segm_vec)}")
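# The group files are assumed to hold a single comma-separated vector of
# integer group ids, one per token. The core computation then reduces to
# this minimal sketch (vector invented):
import numpy as np
import segeval

positions = [0, 0, 0, 1, 1, 2, 2, 2, 2]  # as read from "0,0,0,1,1,2,2,2,2"
masses = segeval.convert_positions_to_masses(positions)
print(masses)           # (3, 2, 4)
print(np.mean(masses))  # 3.0, the mean segment length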
import sys
import codecs

from nltk.metrics.segmentation import pk, windowdiff
import segeval as se
import horae as ho

if __name__ == '__main__':
    test = sys.argv[1]
    classifier = sys.argv[2]
    type_ = sys.argv[3]
    level = sys.argv[4]

    path_pred = ("../data/test/seg/" + test + "_" + level + ".pred_" +
                 classifier)
    path_ref = ("../data/test/choiformat/" + type_ + "/" + test + "_" +
                level + ".ref")

    ref, nbref1, refs = ho.load_text(path_ref)
    pred, nbpred1, preds = ho.load_text(path_pred)

    d = {"stargazer": {"1": refs, "2": preds}}
    seg1 = d['stargazer']['1']
    seg2 = d['stargazer']['2']
    segs1 = se.convert_positions_to_masses(seg1)
    segs2 = se.convert_positions_to_masses(seg2)

    print("pk\tWindowdiff:\n")
    print(str(round(se.pk(segs2, segs1), 4)) + "\t" +
          str(round(se.window_diff(segs2, segs1), 4)))
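# Note on argument order: segeval's pk and window_diff take the
# hypothesis first and the reference second, which is why the predicted
# masses (segs2) come before the reference masses (segs1) above. A tiny
# self-contained check with invented segmentations:
import segeval as se

hyp = se.convert_positions_to_masses([1, 1, 2, 2, 2])  # masses (2, 3)
ref = se.convert_positions_to_masses([1, 1, 1, 2, 2])  # masses (3, 2)
print(float(se.pk(hyp, ref)), float(se.window_diff(hyp, ref)))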
rstr_best_real_group_vec = np.delete(best_real_group_vec,
                                     indices_for_known_label)

# Compute NMI score
nmi = normalized_mutual_info_score(rstr_real_group_vec, rstr_algo_group_vec)

# Compute MAP
ap_vector = [
    average_precision_score(rstr_best_real_group_vec == group_id,
                            rstr_algo_group_vec == group_id)
    for group_id in range(1, max(rstr_real_group_vec) + 1)
]
map = np.mean(ap_vector)

# Segmentation evaluation
real_segm_vec = convert_positions_to_masses(rstr_real_group_vec)
algo_segm_vec = convert_positions_to_masses(rstr_algo_group_vec)
rdm_group_vec = rstr_real_group_vec.copy()
rdm.shuffle(rdm_group_vec)
rdm_segm_vec = convert_positions_to_masses(rdm_group_vec)
pk_res = pk(algo_segm_vec, real_segm_vec)
win_diff = window_diff(algo_segm_vec, real_segm_vec)
pk_rdm = pk(rdm_segm_vec, real_segm_vec)
win_diff_rdm = window_diff(rdm_segm_vec, real_segm_vec)

# Compute the aggregate labels
df_results = pd.DataFrame(result_matrix)
df_results["Token"] = token_list
type_results = df_results.groupby("Token").mean()
type_list = list(type_results.index)
type_values = type_results.to_numpy()
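# To make the final aggregation step concrete, here is a tiny
# self-contained sketch of the same groupby pattern with invented data:
# per-occurrence group scores for 4 token occurrences of 2 types.
import numpy as np
import pandas as pd

result_matrix = np.array([[0.9, 0.1],
                          [0.2, 0.8],
                          [0.8, 0.2],
                          [0.1, 0.9]])
token_list = ["cat", "dog", "cat", "dog"]
df_results = pd.DataFrame(result_matrix)
df_results["Token"] = token_list
# Averaging over occurrences gives one score vector per token type:
# "cat" -> [0.85, 0.15], "dog" -> [0.15, 0.85]
print(df_results.groupby("Token").mean())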
sent_list = txt_f.readlines()

# Make the whole text
text_string = " ".join(sent_list)
# Split by tokens
token_list = nltk.word_tokenize(text_string)
# Vocabulary of text
vocab_text = set(token_list)

# Get the groups
with open(f"{input_text_folder}/{group_file_list[i]}", "r") as grp_f:
    token_group_vec = grp_f.read()
token_group_vec = np.array(
    [int(element) for element in token_group_vec.split(",")])
n_groups = len(set(token_group_vec))
token_segm_vec = segeval.convert_positions_to_masses(token_group_vec)

# Make groups by sentences
sent_group_vec = []
ind_1 = 0
for sent in sent_list:
    sent_token = nltk.word_tokenize(sent)
    token_group = list(token_group_vec[ind_1:(ind_1 + len(sent_token))])
    sent_group_vec.append(int(max(set(token_group), key=token_group.count)))
    ind_1 = ind_1 + len(sent_token)
sent_group_vec = np.array(sent_group_vec)
sent_segm_vec = segeval.convert_positions_to_masses(sent_group_vec)

# Write results
with open(output_file, "a") as output:
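# The sentence-level label above is a majority vote:
# max(set(token_group), key=token_group.count) picks the most frequent
# token group in the sentence. In isolation, with an invented token_group:
token_group = [2, 2, 1, 2, 3]
print(max(set(token_group), key=token_group.count))  # 2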