def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    where d is the difference between the rank of a word in the *worder* list
    and its original index in the reference sentence.

    Using the (H0,R0) and (H5, R5) example from the paper

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> round(spearman_rho(worder, normalize=False), 3)
    -0.591
    >>> round(spearman_rho(worder), 3)
    0.205

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Spearman's Rho correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len)))
    rho = 1 - sum_d_square / choose(worder_len + 1, 3)

    if normalize:
        # If normalized, the rho output falls between 0.0 and 1.0
        return (rho + 1) / 2
    else:
        # Otherwise, the rho output falls between -1.0 and +1.0
        return rho
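# Both spearman_rho and kendall_tau call a ``choose(n, k)`` helper that is
# assumed to be defined elsewhere in this module.  The sketch below is a
# minimal stand-in, assuming plain binomial-coefficient ("n choose k")
# semantics that return 0 when k is out of range; the module's own version
# may differ (e.g. it may be memoized).
def choose(n, k):
    """
    Computes the binomial coefficient C(n, k), i.e. the number of ways to
    pick an unordered k-subset out of n items.

    >>> choose(11, 2)
    55
    >>> choose(1, 2)
    0
    """
    if not 0 <= k <= n:
        return 0
    k = min(k, n - k)  # exploit symmetry to keep the loop short
    numerator = denominator = 1
    for i in range(1, k + 1):
        numerator *= n - k + i
        denominator *= i
    return numerator // denominator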
def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs - 1

    Note that the increasing pairs can be discontinuous in the *worder* list
    and each increasing sequence can be tabulated as choose(len(seq), 2)
    increasing pairs, e.g.

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> number_possible_pairs = choose(len(worder), 2)
    >>> round(kendall_tau(worder, normalize=False), 3)
    -0.236
    >>> round(kendall_tau(worder), 3)
    0.382

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    # With worder_len < 2, `choose(worder_len, 2)` will be 0.
    # As we divide by this, it would give a ZeroDivisionError.
    # To avoid this, we just return the lowest possible score.
    if worder_len < 2:
        tau = -1
    else:
        # Extract the groups of increasing/monotonic sequences.
        increasing_sequences = find_increasing_sequences(worder)
        # Calculate no. of increasing pairs in the *worder* list.
        num_increasing_pairs = sum(
            choose(len(seq), 2) for seq in increasing_sequences
        )
        # Calculate no. of possible pairs.
        num_possible_pairs = choose(worder_len, 2)
        # Kendall's Tau computation.
        tau = 2 * num_increasing_pairs / num_possible_pairs - 1

    if normalize:
        # If normalized, the tau output falls between 0.0 and 1.0
        return (tau + 1) / 2
    else:
        # Otherwise, the tau output falls between -1.0 and +1.0
        return tau
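# kendall_tau also relies on a ``find_increasing_sequences`` helper assumed
# to be defined elsewhere in this module.  The sketch below is a minimal
# stand-in, assuming "increasing sequences" means maximal runs where each
# rank is exactly one greater than the previous; with this reading the
# doctest above reproduces (-0.236 / 0.382) since the runs (7, 8, 9, 10) and
# (0, 1, 2, 3, 4, 5) contribute choose(4, 2) + choose(6, 2) = 21 pairs out
# of choose(11, 2) = 55 possible pairs.
def find_increasing_sequences(worder):
    """
    Yields the monotonically increasing runs in *worder* as tuples,
    skipping single elements since they contribute no pairs.

    >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    >>> list(find_increasing_sequences(worder))
    [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]
    """
    run = []
    for rank in worder:
        if run and rank == run[-1] + 1:
            # Extend the current consecutive run.
            run.append(rank)
        else:
            # Close off the previous run and start a new one.
            if len(run) > 1:
                yield tuple(run)
            run = [rank]
    if len(run) > 1:
        yield tuple(run)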