def load_textset(n_jobs, use_gpu, pin_memory, corpus, text):
    """Build train/dev dataloaders for text-only data.

    Returns (train_loader, dev_loader, vocab_size, tokenizer, data_msg).
    """
    # Tokenizer for the text corpus
    tokenizer = load_text_encoder(**text)
    # Underlying datasets plus per-split batch sizes
    train_ds, dev_ds, train_bs, dev_bs, data_msg = create_textset(
        tokenizer, **corpus)
    # Per-split collate functions
    train_collate = partial(collect_text_batch, mode='train')
    dev_collate = partial(collect_text_batch, mode='dev')
    # Text data is already in RAM, so no loader workers are needed
    train_loader = DataLoader(train_ds, batch_size=train_bs, shuffle=True,
                              drop_last=True, collate_fn=train_collate,
                              num_workers=0, pin_memory=use_gpu)
    dev_loader = DataLoader(dev_ds, batch_size=dev_bs, shuffle=False,
                            drop_last=False, collate_fn=dev_collate,
                            num_workers=0, pin_memory=pin_memory)
    # Summary message for logging
    data_msg.append('I/O spec. | Token type = {}\t| Vocab size = {}'.format(
        tokenizer.token_type, tokenizer.vocab_size))
    return train_loader, dev_loader, tokenizer.vocab_size, tokenizer, data_msg
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Feature pipeline; copy() keeps the caller's audio config untouched
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # In testing mode, tr_set=dv_set and dv_set=tt_set
    train_ds, dev_ds, train_bs, dev_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)
    # Collate functions for each split
    collate_train = partial(collect_audio_batch,
                            audio_transform=audio_transform, mode=mode)
    collate_eval = partial(collect_audio_batch,
                           audio_transform=audio_transform, mode='test')
    # Shuffling / batch dropping only makes sense while actually training
    # (and never when sorting utterances by ascending length)
    randomize = mode == 'train' and not ascending
    train_loader = DataLoader(train_ds, batch_size=train_bs,
                              shuffle=randomize, drop_last=randomize,
                              collate_fn=collate_train, num_workers=n_jobs,
                              pin_memory=use_gpu)
    dev_loader = DataLoader(dev_ds, batch_size=dev_bs, shuffle=False,
                            drop_last=False, collate_fn=collate_eval,
                            num_workers=n_jobs, pin_memory=pin_memory)
    # Summary message for logging
    data_msg.append('I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
                    .format(audio['feat_type'], feat_dim,
                            tokenizer.token_type, tokenizer.vocab_size))
    return train_loader, dev_loader, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def load_noisy_dataset(job, input, n_jobs, use_gpu, pin_memory, ascending,
                       corpus, audio, text):
    ''' Prepare a training dataloader over noisy data. '''
    # NOTE: parameter name `input` shadows the builtin but is kept for callers.
    # Feature pipeline; copy() keeps the caller's audio config untouched
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Noisy training set plus its batch size and mode
    train_ds, train_bs, mode = create_noisy_dataset(
        job, input, tokenizer, ascending, **corpus)
    # Collate function bound to this job's transform
    collate_train = partial(collect_audio_batch,
                            audio_transform=audio_transform, mode=mode)
    # Shuffle/drop-last only apply while actually training
    randomize = mode == 'train' and not ascending
    train_loader = DataLoader(train_ds, batch_size=train_bs,
                              shuffle=randomize, drop_last=randomize,
                              collate_fn=collate_train, num_workers=n_jobs,
                              pin_memory=use_gpu)
    return train_loader, feat_dim, tokenizer.vocab_size, tokenizer
def repro_load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for testing (reproduction run). '''
    # Feature pipeline; copy() keeps the caller's audio config untouched
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Hard-wired to the DLHLP corpus; bucket size fixed at 1 for testing
    from corpus.dlhlp import DlhlpDataset as Dataset
    test_ds = Dataset(corpus['path'], corpus['test_split'], tokenizer, 1)
    # Collate function (always test mode)
    collate_eval = partial(collect_audio_batch,
                           audio_transform=audio_transform, mode='test')
    # One utterance per batch, corpus order preserved
    test_loader = DataLoader(test_ds, batch_size=1, shuffle=False,
                             drop_last=False, collate_fn=collate_eval,
                             num_workers=n_jobs, pin_memory=pin_memory)
    # Summary message for logging
    msg = 'I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'.format(
        audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size)
    return test_loader, feat_dim, tokenizer.vocab_size, tokenizer, msg
def load_test_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for testing.

    Fixes vs. previous version: the docstring wrongly said
    "training/validation", and a `shuffle` flag was computed but never
    used (the test loader is always unshuffled) — both removed.
    '''

    def _create_dataset(tokenizer, ascending, name, path, bucketing, batch_size,
                        train_split=None, dev_split=None, test_split=None):
        ''' Build the test dataset for a known corpus.

        Returns (tt_set, batch_size, mode, data_msg); the extra slots keep
        the same shape as the shared create_dataset() interface.
        '''
        # Recognize corpus by name
        if name.lower() == "librispeech":
            from corpus.librispeech import LibriDataset as Dataset
        elif name.lower() == "dlhlp":
            from corpus.dlhlp import DlhlpDataset as Dataset
        elif name.lower() == 'external':
            from corpus.external import ExternalDataset as Dataset
        else:
            raise NotImplementedError
        # Testing mode; bucketing disabled (bucket size fixed to 1)
        mode = 'test'
        tt_set = Dataset(path, test_split, tokenizer, 1)
        return tt_set, batch_size, mode, []

    # Audio feature extractor; copy() keeps the caller's config untouched
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
    dv_set, dv_loader_bs, mode, data_msg = _create_dataset(
        tokenizer, ascending, **corpus)
    # Collate function (always test mode)
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode='test')
    # Test loader: corpus order preserved, every sample kept
    dv_set = DataLoader(dv_set, batch_size=dv_loader_bs, shuffle=False,
                        drop_last=False, collate_fn=collect_dv,
                        num_workers=n_jobs, pin_memory=pin_memory)
    # Summary message for logging
    data_msg.append(
        'I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
        .format(audio['feat_type'], feat_dim, tokenizer.token_type,
                tokenizer.vocab_size))
    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def transfer_with_mapping(self, ckpt, transfer_config, cur_tokenizer):
    ''' Transfer ctc layer weight to new one by method =
        - "no": do not transfer
        - "ipa": transfer by ipa ground truth (identity map on shared symbols)
        - "mapping": transfer via an explicit target->source json mapping file

    Fixes vs. previous version: typo in the returned message
    ("Tranfsering" -> "Transferring"); removed the unused `device` local
    and the commented-out checkpoint-loading code it served.
    '''
    # Pull the source CTC layer weights out of the checkpoint, then load
    # the remaining encoder weights (stripping the 'encoder.' prefix)
    old_weights = ckpt['model'].pop('ctc_layer.weight')
    old_bias = ckpt['model'].pop('ctc_layer.bias')
    self.encoder.load_state_dict({
        n[8:]: v
        for n, v in ckpt['model'].items() if n.startswith('encoder.')
    })
    # Transfer weights
    method = transfer_config.pop('method')
    mapping = transfer_config.pop('mapping', None)
    self.init_ctclayer()
    if method == 'no':
        pass
    elif method in ['ipa', 'mapping']:
        old_vocab2idx = load_text_encoder(**transfer_config)._vocab2idx
        if method == 'ipa':
            # target --> src: identical symbols map onto themselves
            mapping = {v: v for v in cur_tokenizer._vocab_list}
        else:
            with open(mapping, 'r') as f:
                mapping = json.load(f)
            # Keep only entries whose target token exists in the new vocab
            mapping = {
                tar_v: src_v
                for tar_v, src_v in mapping.items()
                if tar_v in cur_tokenizer._vocab_list
            }
        # Copy each mapped row of the old CTC layer into the new one;
        # silently skip source symbols absent from the old vocab
        for tar_v, src_v in mapping.items():
            tar_i = cur_tokenizer._vocab2idx[tar_v]
            src_i = old_vocab2idx.get(src_v, None)
            if src_i is not None:
                self.ctc_layer.weight.data[tar_i].copy_(
                    old_weights.data[src_i])
                self.ctc_layer.bias.data[tar_i].copy_(old_bias.data[src_i])
    else:
        raise ValueError(f'Not supporting method {method}')
    msg = f"Transferring weight from old CTCLayer with method {method}"
    return msg
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text): ''' Prepare dataloader for training/validation''' # Audio feature extractor audio_transform, feat_dim = create_transform(audio.copy()) ## from src.audio import create_transform ''' Returns a pytorch seq module dealing audio transform ''' # Text tokenizer tokenizer = load_text_encoder(**text) ## from src.text import load_text_encoder # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set) '''
def load_dataset(n_jobs, use_gpu, pin_memory, corpus, audio, inference_stage=False):
    ''' Prepare audio dataloader for solver '''
    # Meta-data messages collected for logging
    data_msg = []
    # One converter object handles every transform / inverse-transform
    audio_converter = load_audio_transform(**audio)
    data_msg.append('Audio spec.| Feature type = {}\t\t| Feature dim = {}'
                    .format(audio_converter.feat_type, audio_converter.feat_dim))
    # Phoneme tokenizer (returns ground-truth phone sequences when available)
    tokenizer = load_text_encoder('phoneme',
                                  vocab_file=corpus['vocab_file'],
                                  map_table=corpus['map_table'])
    data_msg.append('Text spec. | Token type = {}\t| Vocab size = {}'
                    .format(tokenizer.token_type, tokenizer.vocab_size))
    # Load every split at once
    unpair_set, pair_set, dev_set, test_set, set_msg = create_dataset(
        **corpus, inference_stage=inference_stage)
    data_msg.extend(set_msg)
    # Collate functions for train-style vs dev-style batching
    tr_collect = partial(collect_fn, audio_converter=audio_converter,
                         text_loader=tokenizer, mode='train')
    dv_collect = partial(collect_fn, audio_converter=audio_converter,
                         text_loader=tokenizer, mode='dev')

    def _wrap(ds, collate, randomize):
        # Shared loader settings; batch size always comes from the dataset
        return DataLoader(ds, batch_size=ds.bs_for_collate,
                          shuffle=randomize, drop_last=randomize,
                          collate_fn=collate, num_workers=max(0, n_jobs),
                          pin_memory=pin_memory, worker_init_fn=_worker_init)

    # During inference the "training" sets are batched like dev sets and
    # are neither shuffled nor truncated
    train_collate = dv_collect if inference_stage else tr_collect
    unpair_set = _wrap(unpair_set, train_collate, not inference_stage)
    pair_set = _wrap(pair_set, train_collate, not inference_stage)
    dev_set = _wrap(dev_set, dv_collect, False)
    if inference_stage:
        test_set = _wrap(test_set, dv_collect, False)
    # Augmentation summary
    data_msg.append('Augment | Speed rate = {}\t| S/N rate = {}'
                    .format(audio_converter.time_stretch_range,
                            audio_converter.snr_range))
    return unpair_set, pair_set, dev_set, test_set, audio_converter, tokenizer, data_msg
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Two mel-spectrogram pipelines so train-time augmentation never
    # leaks into the dev/test transform
    audio_transform_tr, feat_dim = create_transform(audio.copy(), 'train')
    audio_transform_dv, feat_dim = create_transform(audio.copy(), 'dev')
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # In testing mode, tr_set=dv_set and dv_set=tt_set
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)
    # Collate functions bound to the matching transform
    collect_tr = partial(collect_audio_batch,
                         audio_transform=audio_transform_tr, mode=mode)
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform_dv, mode='test')
    # Only the training split is shuffled / truncated to full batches
    randomize = mode == 'train' and not ascending
    tr_set = DataLoader(tr_set, batch_size=tr_loader_bs, shuffle=randomize,
                        drop_last=randomize, collate_fn=collect_tr,
                        num_workers=n_jobs, pin_memory=use_gpu)

    def _dev_loader(ds):
        # Dev/test loaders keep corpus order and every sample
        return DataLoader(ds, batch_size=dv_loader_bs, shuffle=False,
                          drop_last=False, collate_fn=collect_dv,
                          num_workers=n_jobs, pin_memory=pin_memory)

    # dv_set may be a single dataset or a list of them
    if type(dv_set) is list:
        dv_set = [_dev_loader(ds) for ds in dv_set]
    else:
        dv_set = _dev_loader(dv_set)
    # Summary message for logging
    data_msg.append('I/O spec. | Audio Feature = {}\t| Feature Dim = {}\t| Token Type = {}\t| Vocab Size = {}'
                    .format(audio['feat_type'], feat_dim,
                            tokenizer.token_type, tokenizer.vocab_size))
    return tr_set, dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def load_wav_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    """Prepare raw-waveform dataloaders.

    If mode == 'train': tr_set is the train split, dv_set the dev split.
    If mode == 'eval':  tr_set is the dev split,  dv_set the test split.
    Returns (tr_loader, dv_loader(s), vocab_size, tokenizer, data_msg).
    """
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, text['mode'], **corpus)
    # Waveform readers; eval reader still receives the time_aug flag
    tr_reader = ReadAudio(SAMPLE_RATE, mode=mode, time_aug=audio['time_aug'])
    dv_reader = ReadAudio(SAMPLE_RATE, mode='eval', time_aug=audio['time_aug'])
    # Collate functions bound to the matching reader
    collect_tr = partial(collect_wav_batch, audio_reader=tr_reader, mode=mode)
    collect_dv = partial(collect_wav_batch, audio_reader=dv_reader, mode='eval')
    # Only the training split is shuffled / truncated to full batches
    randomize = mode == 'train' and not ascending
    tr_set = DataLoader(tr_set, batch_size=tr_loader_bs, shuffle=randomize,
                        drop_last=randomize, collate_fn=collect_tr,
                        num_workers=n_jobs, pin_memory=use_gpu)

    def _eval_loader(ds):
        # Eval loaders keep corpus order and every sample
        return DataLoader(ds, batch_size=dv_loader_bs, shuffle=False,
                          drop_last=False, collate_fn=collect_dv,
                          num_workers=n_jobs, pin_memory=pin_memory)

    # dv_set may be a single dataset or a list of them
    if type(dv_set) is list:
        dv_set = [_eval_loader(ds) for ds in dv_set]
    else:
        dv_set = _eval_loader(dv_set)
    return tr_set, dv_set, tokenizer.vocab_size, tokenizer, data_msg