def filter_symbols(data: MergedDataset, symbols: SymbolIdDict, accent_ids: AccentsDict, speakers: SpeakersDict, allowed_symbol_ids: Set[int], logger: Logger) -> MergedDatasetContainer:
  # maybe check all symbol ids are valid before
  allowed_symbols = [symbols.get_symbol(x) for x in allowed_symbol_ids]
  not_allowed_symbols = [symbols.get_symbol(x) for x in symbols.get_all_symbol_ids()
                         if x not in allowed_symbol_ids]
  logger.info(f"Keep utterances with these symbols: {' '.join(allowed_symbols)}")
  logger.info(f"Remove utterances with these symbols: {' '.join(not_allowed_symbols)}")

  logger.info("Statistics before filtering:")
  log_stats(data, symbols, accent_ids, speakers, logger)

  # Keep only entries whose serialized symbol ids all lie in the allowed set.
  result = MergedDataset([
    x for x in data.items()
    if contains_only_allowed_symbols(deserialize_list(x.serialized_symbol_ids), allowed_symbol_ids)
  ])

  if len(result) > 0:
    logger.info(
      f"Removed {len(data) - len(result)} from {len(data)} total entries and got {len(result)} entries ({len(result) / len(data) * 100:.2f}%).")
  else:
    logger.info("Removed all utterances!")

  # Rebuild the id mappings so they only contain what remains after filtering.
  new_symbol_ids = update_symbols(result, symbols)
  new_accent_ids = update_accents(result, accent_ids)
  new_speaker_ids = update_speakers(result, speakers)

  logger.info("Statistics after filtering:")
  log_stats(result, new_symbol_ids, new_accent_ids, new_speaker_ids, logger)

  res = MergedDatasetContainer(
    name=None,
    data=result,
    accent_ids=new_accent_ids,
    speaker_ids=new_speaker_ids,
    symbol_ids=new_symbol_ids,
  )
  return res
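
# Hypothetical usage sketch (not part of the original module): shows one way
# filter_symbols could be called, assuming `data`, `symbols`, `accent_ids`,
# `speakers` and `logger` were loaded elsewhere in the pipeline. The
# ASCII-only criterion is an assumption made purely for illustration.
def example_filter_to_ascii_symbols(data: MergedDataset, symbols: SymbolIdDict, accent_ids: AccentsDict, speakers: SpeakersDict, logger: Logger) -> MergedDatasetContainer:
  # Allow every symbol whose text representation is pure ASCII (hypothetical rule).
  allowed_ids = {
    symbol_id for symbol_id in symbols.get_all_symbol_ids()
    if symbols.get_symbol(symbol_id).isascii()
  }
  return filter_symbols(data, symbols, accent_ids, speakers, allowed_ids, logger)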

def sims_to_csv(sims: Dict[int, List[Tuple[int, float]]], symbols: SymbolIdDict) -> pd.DataFrame:
  lines = []
  assert len(sims) == len(symbols)
  for symbol_id, similarities in sims.items():
    # One row per symbol: the symbol itself, a separator, then the most
    # similar symbols alternating with their similarity scores.
    line = [f"{symbols.get_symbol(symbol_id)}", "<=>"]
    for other_symbol_id, similarity in similarities:
      line.append(symbols.get_symbol(other_symbol_id))
      line.append(f"{similarity:.2f}")
    lines.append(line)
  df = pd.DataFrame(lines)
  return df

def plot_embeddings(symbols: SymbolIdDict, emb: torch.Tensor, logger: Logger) -> Tuple[pd.DataFrame, go.Figure, go.Figure]:
  assert emb.shape[0] == len(symbols)
  logger.info(f"Emb size {emb.shape}")
  logger.info(f"Sym len {len(symbols)}")
  # Pairwise symbol similarities, exported as a DataFrame for inspection.
  sims = get_similarities(emb.numpy())
  df = sims_to_csv(sims, symbols)
  all_symbols_sorted = [symbols.get_symbol(x) for x in range(len(symbols))]
  # Normalize the embeddings before plotting them in 2D and 3D.
  emb_normed = norm2emb(emb)
  fig_2d = emb_plot_2d(emb_normed, all_symbols_sorted)
  fig_3d = emb_plot_3d(emb_normed, all_symbols_sorted)
  return df, fig_2d, fig_3d
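
# Hypothetical usage sketch (not part of the original module): exports the
# similarity table and both embedding plots to disk. The file names and the
# assumption that `emb` is a trained model's symbol-embedding weight matrix
# (already detached and on the CPU) are illustrative only.
def example_export_embedding_plots(symbols: SymbolIdDict, emb: torch.Tensor, logger: Logger) -> None:
  df, fig_2d, fig_3d = plot_embeddings(symbols, emb, logger)
  df.to_csv("symbol_similarities.csv", header=False, index=False)
  fig_2d.write_html("symbol_embeddings_2d.html")
  fig_3d.write_html("symbol_embeddings_3d.html")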