import random
import sys
from itertools import groupby
from typing import Dict, List, Optional, Tuple

import click
import numpy as np
from scipy.optimize import linear_sum_assignment

# NOTE: project-internal names used below (Turn, compute_spk_overlap, info,
# load_rttms, load_uem, trim_turns, merge_turns, write_rttm, HungarianMap,
# GreedyMap, DOVERLap) come from the package's own modules.


def compute_cost_tensor(
    self, turns_list: List[List[Turn]]
) -> Tuple[np.ndarray, Dict[Tuple[int, int], np.ndarray]]:
    """
    Compute the N-dimensional cost tensor over all pairs of hypotheses.
    Entry (s_1, ..., s_N) is the negated total overlap obtained by mapping
    speaker s_i of hypothesis i to a common label, summed over all pairs.
    """
    N = len(turns_list)
    pairwise_costs = {}
    has_single_speaker = False

    for i, ref_turns in enumerate(turns_list):
        for j, sys_turns in enumerate(turns_list):
            if j <= i:
                continue
            cost = []
            ref_groups = {
                key: list(group)
                for key, group in groupby(ref_turns, lambda x: x.speaker_id)
            }
            sys_groups = {
                key: list(group)
                for key, group in groupby(sys_turns, lambda x: x.speaker_id)
            }
            if len(ref_groups) == 1 or len(sys_groups) == 1:
                has_single_speaker = True

            for ref_spk_id in sorted(ref_groups.keys()):
                cur_row = []
                ref_spk_turns = ref_groups[ref_spk_id]
                for sys_spk_id in sorted(sys_groups.keys()):
                    sys_spk_turns = sys_groups[sys_spk_id]
                    total_overlap = compute_spk_overlap(
                        ref_spk_turns, sys_spk_turns
                    )
                    cur_row.append(-1 * total_overlap)
                cost.append(cur_row)

            # The expand_dims inserts a singleton axis for every hypothesis
            # outside the pair (i, j), so the matrices broadcast when added.
            new_axis = list(range(N))
            new_axis.remove(i)
            new_axis.remove(j)
            pairwise_costs[(i, j)] = np.expand_dims(
                np.array(cost), axis=tuple(new_axis)
            )

    if has_single_speaker:
        # Iterate and add, since numpy cannot broadcast with 2 dummy dimensions
        vals = list(pairwise_costs.values())
        cost_tensor = vals[0]
        for val in vals[1:]:
            cost_tensor = np.add(cost_tensor, val)
    else:
        # Otherwise the built-in sum adds the arrays one at a time, letting
        # numpy broadcast each pairwise addition
        cost_tensor = sum(pairwise_costs.values())
    return cost_tensor, pairwise_costs

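# --- Illustrative sketch (not part of the original module) ---
# How the expand_dims trick above combines pairwise cost matrices into an
# N-dimensional tensor, shown for N = 3 hypotheses with 2 speakers each.
# Each pairwise matrix gets a singleton axis for every hypothesis outside
# its pair, so adding the three matrices broadcasts to the full tensor.
import numpy as np

c01 = np.ones((2, 2))      # toy pairwise costs between hypotheses 0 and 1
c02 = np.ones((2, 2)) * 2  # between hypotheses 0 and 2
c12 = np.ones((2, 2)) * 3  # between hypotheses 1 and 2

t01 = np.expand_dims(c01, axis=2)  # shape (2, 2, 1)
t02 = np.expand_dims(c02, axis=1)  # shape (2, 1, 2)
t12 = np.expand_dims(c12, axis=0)  # shape (1, 2, 2)

cost_tensor = t01 + t02 + t12  # broadcasts to shape (2, 2, 2)
# cost_tensor[a, b, c] is the summed cost of giving speaker a of
# hypothesis 0, speaker b of hypothesis 1, and speaker c of hypothesis 2
# the same output label; here every entry equals 1 + 2 + 3 = 6.
assert cost_tensor.shape == (2, 2, 2) and np.all(cost_tensor == 6.0)
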
def __map_hungarian(
    ref_turns: List[Turn], sys_turns: List[Turn]
) -> Dict[Tuple[int, int], int]:
    """
    Use the Hungarian algorithm for label mapping in the 2-system special case.
    """
    cost_matrix = []
    ref_groups = {
        key: list(group)
        for key, group in groupby(ref_turns, lambda x: x.speaker_id)
    }
    sys_groups = {
        key: list(group)
        for key, group in groupby(sys_turns, lambda x: x.speaker_id)
    }
    for ref_spk_id in sorted(ref_groups.keys()):
        cur_row = []
        ref_spk_turns = ref_groups[ref_spk_id]
        for sys_spk_id in sorted(sys_groups.keys()):
            sys_spk_turns = sys_groups[sys_spk_id]
            total_overlap = _compute_spk_overlap(ref_spk_turns, sys_spk_turns)
            cur_row.append(-1 * total_overlap)
        cost_matrix.append(cur_row)

    cost_matrix = np.array(cost_matrix)
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # Keep track of row and col indices left unmatched by the assignment
    row_indices_remaining = list(range(len(cost_matrix)))
    col_indices_remaining = list(range(len(cost_matrix[0])))
    label_mapping = {}
    for i in range(len(row_ind)):
        label_mapping[(0, row_ind[i])] = i
        row_indices_remaining.remove(row_ind[i])
        label_mapping[(1, col_ind[i])] = i
        col_indices_remaining.remove(col_ind[i])
    next_label = len(row_ind)

    # Assign fresh labels to the remaining row indices
    for row_index in row_indices_remaining:
        label_mapping[(0, row_index)] = next_label
        next_label += 1

    # Assign fresh labels to the remaining col indices
    for col_index in col_indices_remaining:
        label_mapping[(1, col_index)] = next_label
        next_label += 1

    return label_mapping

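# --- Minimal usage sketch (toy data, not from the original source) ---
# What linear_sum_assignment returns for a small negated-overlap cost
# matrix with 2 reference and 3 system speakers, and which labels the
# loops above would therefore assign.
import numpy as np
from scipy.optimize import linear_sum_assignment

toy_cost = np.array([[-5.0, -1.0, 0.0],
                     [-0.5, -4.0, -2.0]])
row_ind, col_ind = linear_sum_assignment(toy_cost)
assert list(row_ind) == [0, 1] and list(col_ind) == [0, 1]
# Reference speakers 0 and 1 share labels 0 and 1 with system speakers
# 0 and 1; the unmatched system speaker 2 then receives the next free
# label, 2, in the "remaining indices" loop.
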
def __map_pair(
    self, ref_turns: List[Turn], sys_turns: List[Turn]
) -> Dict[Tuple[int, str], int]:
    """
    Map two hypotheses to a common label space by running the Hungarian
    algorithm on their negated pairwise speaker overlaps.
    """
    ref_groups = {
        key: list(group)
        for key, group in groupby(ref_turns, lambda x: x.speaker_id)
    }
    sys_groups = {
        key: list(group)
        for key, group in groupby(sys_turns, lambda x: x.speaker_id)
    }
    ref_keys = sorted(ref_groups.keys())
    sys_keys = sorted(sys_groups.keys())
    M, N = len(ref_keys), len(sys_keys)
    cost_matrix = np.zeros((M, N))
    for i, ref_spk_id in enumerate(ref_keys):
        ref_spk_turns = ref_groups[ref_spk_id]
        for j, sys_spk_id in enumerate(sys_keys):
            sys_spk_turns = sys_groups[sys_spk_id]
            total_overlap = compute_spk_overlap(ref_spk_turns, sys_spk_turns)
            cost_matrix[i, j] = -1 * total_overlap

    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # Keep track of row and col indices left unmatched by the assignment
    row_indices_remaining = list(range(M))
    col_indices_remaining = list(range(N))
    label_mapping = {}
    for i in range(len(row_ind)):
        label_mapping[(0, ref_keys[row_ind[i]])] = i
        row_indices_remaining.remove(row_ind[i])
        label_mapping[(1, sys_keys[col_ind[i]])] = i
        col_indices_remaining.remove(col_ind[i])
    next_label = len(row_ind)

    # Assign fresh labels to the remaining row indices
    for row_index in row_indices_remaining:
        label_mapping[(0, ref_keys[row_index])] = next_label
        next_label += 1

    # Assign fresh labels to the remaining col indices
    for col_index in col_indices_remaining:
        label_mapping[(1, sys_keys[col_index])] = next_label
        next_label += 1

    return label_mapping

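# --- Toy sketch of the resulting mapping (hypothetical speaker IDs) ---
# With reference speakers {"spk_A", "spk_B"} and system speakers
# {"sys_1", "sys_2", "sys_3"}, a matching of spk_A<->sys_1 and
# spk_B<->sys_2 would produce:
example_mapping = {
    (0, "spk_A"): 0, (1, "sys_1"): 0,  # matched pair -> shared label 0
    (0, "spk_B"): 1, (1, "sys_2"): 1,  # matched pair -> shared label 1
    (1, "sys_3"): 2,  # unmatched system speaker -> next free label
}
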
def __validate_global_mapping(self) -> bool:
    """
    Check that every (hypothesis index, speaker ID) pair in the sorted
    turns list has an entry in the global mapping.
    """
    for i, turns in enumerate(self.sorted_turns_list):
        groups = {
            key: list(group)
            for key, group in groupby(turns, lambda x: x.speaker_id)
        }
        for spk in groups:
            if (i, spk) not in self.global_mapping:
                return False
    return True

def main(
    input_rttms: List[click.Path],
    output_rttm: click.Path,
    uem_file: click.Path,
    channel: int,
    random_seed: int,
    **kwargs,  # passed directly to the combine_turns_list() method
) -> None:
    """Apply the DOVER-Lap algorithm on the input RTTM files."""
    # Set random seeds globally
    random.seed(random_seed)
    np.random.seed(random_seed)

    # Load hypothesis speaker turns.
    info("Loading speaker turns from input RTTMs...", file=sys.stderr)
    turns_list = load_rttms(input_rttms)

    if uem_file is not None:
        info("Loading un-partitioned evaluation map (UEM)...", file=sys.stderr)
        uem = load_uem(uem_file)

        # Trim turns to UEM scoring regions.
        info(
            "Trimming hypothesis speaker turns to UEM scoring regions...",
            file=sys.stderr,
        )
        turns_list = [trim_turns(turns, uem) for turns in turns_list]

    info("Merging overlapping speaker turns...", file=sys.stderr)
    turns_list = [merge_turns(turns) for turns in turns_list]

    # Group turns by file ID so each recording is combined independently.
    file_to_turns_list = dict()
    for turns in turns_list:
        for fid, g in groupby(turns, lambda x: x.file_id):
            if fid in file_to_turns_list:
                file_to_turns_list[fid].append(list(g))
            else:
                file_to_turns_list[fid] = [list(g)]

    # Run the DOVER-Lap algorithm on each file.
    file_to_out_turns = dict()
    for file_id in file_to_turns_list:
        info("Processing file {}...".format(file_id), file=sys.stderr)
        turns_list = file_to_turns_list[file_id]
        # Shuffle so that the hypothesis order is randomized
        random.shuffle(turns_list)
        file_to_out_turns[file_id] = DOVERLap.combine_turns_list(
            turns_list, file_id, **kwargs
        )

    # Write the combined output to an RTTM file.
    write_rttm(
        output_rttm, sum(file_to_out_turns.values(), []), channel=channel
    )

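# --- Hedged usage sketch ---
# Assuming `main` is wired up as the package's `dover-lap` console entry
# point via click (the file names here are illustrative):
#
#   dover-lap combined.rttm hyp1.rttm hyp2.rttm hyp3.rttm
#
# Each input RTTM holds one system's diarization hypotheses; the combined
# diarization is written to combined.rttm, with per-file hypothesis order
# shuffled under the given random seed.
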
def get_speaker_keys(
    turns_list: List[List[Turn]],
) -> Dict[Tuple[int, int], str]:
    """
    Returns a dictionary mapping a (relative) hypothesis index and
    (relative) speaker index to the absolute speaker ID.
    """
    speakers_dict = {}
    for i, turns in enumerate(turns_list):
        turn_groups = {
            key: list(group)
            for key, group in groupby(turns, lambda x: x.speaker_id)
        }
        for j, key in enumerate(sorted(turn_groups.keys())):
            speakers_dict[(i, j)] = key
    return speakers_dict

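# --- Toy, self-contained sketch (stand-in Turn with only speaker_id;
# the real Turn class carries onset/offset/file_id as well) ---
from collections import namedtuple

_ToyTurn = namedtuple("_ToyTurn", ["speaker_id"])
toy_turns_list = [
    [_ToyTurn("B"), _ToyTurn("B"), _ToyTurn("A")],  # hypothesis 0
    [_ToyTurn("X")],                                # hypothesis 1
]
# get_speaker_keys(toy_turns_list) would return
#   {(0, 0): "A", (0, 1): "B", (1, 0): "X"}
# since speaker indices follow sorted() order within each hypothesis,
# not order of first appearance.
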
def get_mapped_turns_list(
    cls,
    turns_list: List[List[Turn]],
    file_id: str,
    method: Optional[str] = "greedy",
    sort_first: Optional[bool] = False,
    second_maximal: Optional[bool] = False,
) -> Tuple[List[List[Turn]], np.ndarray]:
    """
    Take the turns lists from all RTTMs and apply an N-dimensional matching
    approximation algorithm to map them all to a common label space.
    """
    if (len(turns_list) == 2) or (method == "hungarian"):
        # Replace the original turns list with one sorted by average DER
        hungarian_map = HungarianMap(sort_first=sort_first)
        label_mapping, weights = hungarian_map.compute_mapping(turns_list)
        turns_list = hungarian_map.sorted_turns_list
    elif method == "greedy":
        greedy_map = GreedyMap(second_maximal=second_maximal)
        label_mapping, weights = greedy_map.compute_mapping(turns_list)
    else:
        raise ValueError("Unsupported mapping method: {}".format(method))

    # Relabel speakers using the computed mapping
    mapped_turns_list = []
    for i, turns in enumerate(turns_list):
        spk_groups = {
            key: list(group)
            for key, group in groupby(turns, lambda x: x.speaker_id)
        }
        mapped_turns = []
        for spk_id in spk_groups.keys():
            new_spk_id = label_mapping[(i, spk_id)]
            for turn in spk_groups[spk_id]:
                mapped_turns.append(
                    Turn(
                        turn.onset,
                        turn.offset,
                        speaker_id=new_spk_id,
                        file_id=file_id,
                    )
                )
        mapped_turns_list.append(mapped_turns)

    return mapped_turns_list, weights

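# --- Hedged usage sketch (toy arguments) ---
# Assuming this classmethod lives on the combining class (e.g. DOVERLap),
# note that for exactly two hypotheses the Hungarian branch is taken even
# when method="greedy", since the two-system problem is solved optimally:
#
#   mapped, weights = DOVERLap.get_mapped_turns_list(
#       turns_list, "session1", method="greedy", second_maximal=False
#   )
#
# `weights` holds one vote weight per (possibly re-sorted) hypothesis;
# reading turns_list back from hungarian_map.sorted_turns_list keeps the
# weights aligned with the relabeled hypotheses.
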
def get_mapped_turns_list(
    cls,
    turns_list: List[List[Turn]],
    file_id: str,
    run_second_maximal: Optional[bool] = False,
) -> Tuple[List[List[Turn]], np.ndarray]:
    """
    Take the turns lists from all RTTMs and apply an N-dimensional matching
    approximation algorithm to map them all to a common label space.
    """
    N = len(turns_list)  # number of input hypotheses

    if N == 2:
        # If only 2 inputs need to be combined, we use the Hungarian
        # algorithm, since it is provably optimal. We also assign both
        # systems equal weight, to prevent the voting from being dominated
        # by either one.
        label_mapping = cls.__map_hungarian(*turns_list)
        weights = np.array([0.5, 0.5])
    else:
        pairwise_costs = {}
        has_single_speaker = False
        for i, ref_turns in enumerate(turns_list):
            for j, sys_turns in enumerate(turns_list):
                if j <= i:
                    continue
                cost = []
                ref_groups = {
                    key: list(group)
                    for key, group in groupby(
                        ref_turns, lambda x: x.speaker_id
                    )
                }
                sys_groups = {
                    key: list(group)
                    for key, group in groupby(
                        sys_turns, lambda x: x.speaker_id
                    )
                }
                if len(ref_groups) == 1 or len(sys_groups) == 1:
                    has_single_speaker = True

                for ref_spk_id in sorted(ref_groups.keys()):
                    cur_row = []
                    ref_spk_turns = ref_groups[ref_spk_id]
                    for sys_spk_id in sorted(sys_groups.keys()):
                        sys_spk_turns = sys_groups[sys_spk_id]
                        total_overlap = cls.__compute_spk_overlap(
                            ref_spk_turns, sys_spk_turns
                        )
                        cur_row.append(-1 * total_overlap)
                    cost.append(cur_row)

                # The expand_dims inserts a singleton axis for every
                # hypothesis outside the pair (i, j), so the matrices
                # broadcast when added.
                new_axis = list(range(N))
                new_axis.remove(i)
                new_axis.remove(j)
                pairwise_costs[(i, j)] = np.expand_dims(
                    np.array(cost), axis=tuple(new_axis)
                )

        if has_single_speaker:
            # Iterate and add, since numpy cannot broadcast with 2 dummy
            # dimensions
            vals = list(pairwise_costs.values())
            cost_tensor = vals[0]
            for val in vals[1:]:
                cost_tensor = np.add(cost_tensor, val)
        else:
            # Otherwise the built-in sum adds the arrays one at a time,
            # letting numpy broadcast each pairwise addition
            cost_tensor = sum(pairwise_costs.values())

        # The weight of each hypothesis is its total overlap with all
        # other hypotheses
        weights = np.zeros(N)
        for i in range(N):
            cur_pairwise_costs = [
                np.squeeze(x)
                for x in pairwise_costs.values()
                if x.shape[i] != 1
            ]
            weights[i] = -1 * sum(np.sum(x) for x in cur_pairwise_costs)

        label_mapping = cls.__apply_maximal_matching(
            cost_tensor, run_second_maximal
        )

    # Relabel speakers using the computed mapping
    mapped_turns_list = []
    for i, turns in enumerate(turns_list):
        spk_groups = {
            key: list(group)
            for key, group in groupby(turns, lambda x: x.speaker_id)
        }
        mapped_turns = []
        for j, spk_id in enumerate(spk_groups.keys()):
            new_spk_id = label_mapping[(i, j)]
            for turn in spk_groups[spk_id]:
                mapped_turns.append(
                    Turn(
                        turn.onset,
                        turn.offset,
                        speaker_id=new_spk_id,
                        file_id=file_id,
                    )
                )
        mapped_turns_list.append(mapped_turns)

    ranks = cls.__get_ranks(weights)
    return mapped_turns_list, ranks

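# --- Numeric sketch of the weight computation above (toy overlaps) ---
# For N = 3 hypotheses, hypothesis i's weight is the total overlap it
# shares with the others: the negated sum of every pairwise cost array
# in which axis i is a real (non-singleton) dimension. The np.squeeze in
# the original does not change these sums, so it is omitted here.
import numpy as np

toy_pairwise_costs = {
    (0, 1): -np.full((2, 2, 1), 1.0),  # four entries of overlap 1.0
    (0, 2): -np.full((2, 1, 2), 2.0),  # four entries of overlap 2.0
    (1, 2): -np.full((1, 2, 2), 3.0),  # four entries of overlap 3.0
}
N = 3
toy_weights = np.zeros(N)
for i in range(N):
    involved = [x for x in toy_pairwise_costs.values() if x.shape[i] != 1]
    toy_weights[i] = -sum(np.sum(x) for x in involved)
assert list(toy_weights) == [12.0, 16.0, 20.0]
# Hypothesis 2 overlaps most with the others, so it gets the largest
# weight and hence the best rank from __get_ranks.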