def load_data(self, split='train', load_mel_only=False):
    """Announce the data being loaded and attach a dataloader to `self.dataloader`.

    Args:
        split: 'train' or 'test'.
        load_mel_only: when true, the 'acoustic' loader is requested even if
            `self.duo_feature` is set, and `run_mockingjay` is passed as False.

    Raises:
        NotImplementedError: if `split` is neither 'train' nor 'test'.
    """
    if split not in ('train', 'test'):
        raise NotImplementedError('Invalid `split` argument!')

    loader_cfg = self.config['dataloader']
    if split == 'train':
        self.verbose('Loading source data ' + str(loader_cfg['train_set'])
                     + ' from ' + loader_cfg['data_path'])
        if self.duo_feature:
            self.verbose('Loading target data ' + str(loader_cfg['train_set'])
                         + ' from ' + loader_cfg['target_path'])
    else:
        self.verbose('Loading testing data ' + str(loader_cfg['test_set'])
                     + ' from ' + loader_cfg['data_path'])

    if self.duo_feature and not load_mel_only:
        # Currently the duo feature dataloader only supports mockingjay
        # training, so there is no need to specify `run_mockingjay`.
        loader = get_Dataloader(
            split,
            load='duo',
            use_gpu=self.paras.gpu,
            mock_config=self.config['mockingjay'],
            **loader_cfg)
    else:
        # Specify `run_mockingjay` so the dataloader will process
        # mockingjay MAM data (skipped when only mel features are wanted).
        loader = get_Dataloader(
            split,
            load='acoustic',
            use_gpu=self.paras.gpu,
            run_mockingjay=not load_mel_only,
            mock_config=self.config['mockingjay'],
            **loader_cfg)
    self.dataloader = loader
def get_dataloader(args, dataloader_config):
    """Build the train / dev / test dataloaders for a downstream task.

    The pre-training config is read from the checkpoint at `args.ckpt`. If it
    has an 'online' entry, that entry is stored under
    `dataloader_config['online_config']`; note that the caller's dict is
    modified in place.

    Args:
        args: namespace providing `ckpt`, `task`, `gpu` and `seed`.
        dataloader_config: dict with 'data_path' and 'train_set', plus
            'phone_path' for every task other than 'speaker'. All entries are
            forwarded to `get_Dataloader` as keyword arguments.

    Returns:
        Tuple `(train_loader, dev_loader, test_loader)`.

    Raises:
        RuntimeError: if 'data_path' (or 'phone_path' for non-speaker tasks)
            does not exist on disk.
    """
    pretrain_config = torch.load(args.ckpt, map_location='cpu')['Settings']['Config']
    if 'online' in pretrain_config:
        dataloader_config['online_config'] = pretrain_config['online']

    data_path = dataloader_config['data_path']
    if not os.path.exists(data_path):
        # A single formatted message: passing the path as a second positional
        # argument makes the exception render as a tuple repr.
        raise RuntimeError(f'[run_downstream] - Data path not valid: {data_path}')
    print('[run_downstream] - Loading input data: ' + str(dataloader_config['train_set'])
          + ' from ' + data_path)

    if args.task == 'speaker':
        print('[run_downstream] - Loading speaker data: ' + str(dataloader_config['train_set'])
              + ' from ' + data_path)
    else:
        phone_path = dataloader_config['phone_path']
        print('[run_downstream] - Loading phone data: ' + phone_path)
        if not os.path.exists(phone_path):
            raise RuntimeError(f'[run_downstream] - Phone path not valid: {phone_path}')
        if args.task == 'montreal_phone':
            print("[run_downstream] - WARNING: Using a non-preset phone set! Please make sure "
                  "'data_path' (should be: data/libri_mel160_subword5000) and "
                  "'phone_path' (should be: data/libri_phone) are set correctly.")

    loaders = []
    for split in ('train', 'dev', 'test'):
        print('[run_downstream] - getting ' + split + ' dataloader...')
        loaders.append(get_Dataloader(split=split, load=args.task, use_gpu=args.gpu,
                                      seed=args.seed, **dataloader_config))
    return tuple(loaders)
def get_dataloader(args, dataloader_config):
    """Build the train / dev / test dataloaders for a downstream task.

    Args:
        args: namespace providing `task` and `gpu`.
        dataloader_config: dict with 'data_path' and 'train_set', plus
            'phone_path' for every task other than 'speaker'. All entries are
            forwarded to `get_Dataloader` as keyword arguments.

    Returns:
        Tuple `(train_loader, dev_loader, test_loader)`.

    Raises:
        RuntimeError: if 'data_path' does not exist on disk.
    """
    data_path = dataloader_config['data_path']
    if not os.path.exists(data_path):
        # A single formatted message: passing the path as a second positional
        # argument makes the exception render as a tuple repr.
        raise RuntimeError(f'[run_downstream] - Data path not valid: {data_path}')
    print('[run_downstream] - Loading input data: ' + str(dataloader_config['train_set'])
          + ' from ' + data_path)

    if args.task == 'speaker':
        print('[run_downstream] - Loading speaker data: ' + str(dataloader_config['train_set'])
              + ' from ' + data_path)
    else:
        print('[run_downstream] - Loading phone data: ' + dataloader_config['phone_path'])

    loaders = []
    for split in ('train', 'dev', 'test'):
        print('[run_downstream] - getting ' + split + ' dataloader...')
        loaders.append(get_Dataloader(split=split, load=args.task, use_gpu=args.gpu,
                                      **dataloader_config))
    return tuple(loaders)
def load_data(self):
    """Load data for training / validation and keep one sample input.

    Sets `self.train_set` and `self.dev_set` from the 'asr' loader, then
    stores the input part of the first training batch in `self.sample_x`.
    """
    solver_cfg = self.config['solver']
    self.verbose('Loading data from ' + solver_cfg['data_path'])

    self.train_set = get_Dataloader('train', load='asr', use_gpu=self.paras.gpu, **solver_cfg)
    self.dev_set = get_Dataloader('dev', load='asr', use_gpu=self.paras.gpu, **solver_cfg)

    # Get 1 example for auto constructing model
    for first_batch in self.train_set:
        self.sample_x, _ = first_batch
        break

    # A 4-dimensional sample is reduced to its first entry along axis 0.
    if len(self.sample_x.shape) == 4:
        self.sample_x = self.sample_x[0]
def load_data(self):
    """Load the test and dev sets with the 'asr' loader.

    Sets `self.test_set` and `self.dev_set`, in that order.
    """
    solver_cfg = self.config['solver']
    self.verbose('Loading testing data ' + str(solver_cfg['test_set'])
                 + ' from ' + solver_cfg['data_path'])

    for split in ('test', 'dev'):
        loader = get_Dataloader(split, load='asr', use_gpu=self.paras.gpu, **solver_cfg)
        setattr(self, split + '_set', loader)
def get_dataloader(args, config):
    """Build the training dataloader for upstream pre-training.

    The loader mode is chosen from `config['transformer']`:

    * `dual_transformer` truthy  -> 'dual_acoustic'
    * `wave_transformer` truthy  -> 'wave_acoustic'
    * `wave_transformer` present but falsy -> 'acoustic'
    * otherwise -> 'duo' if `config['runner']['duo_feature']`, else 'kaldi'
      if `args.kaldi_data`, else 'acoustic'

    Args:
        args: namespace providing `gpu` and `kaldi_data`.
        config: dict with 'dataloader' and 'transformer' sections, plus
            'runner' (and optionally 'online') depending on the mode.

    Returns:
        The object returned by `get_Dataloader` for the 'train' split.

    Raises:
        RuntimeError: if `config['dataloader']['data_path']` does not exist.
        ValueError: if `dual_transformer` and `wave_transformer` are both set.
    """
    loader_cfg = config['dataloader']
    if not os.path.exists(loader_cfg['data_path']):
        # A single formatted message: passing the path as a second positional
        # argument makes the exception render as a tuple repr.
        raise RuntimeError(f"[run_upstream] - Data path not valid: {loader_cfg['data_path']}")
    print('[run_upstream] - Loading input data: ' + str(loader_cfg['train_set'])
          + ' from ' + loader_cfg['data_path'])
    print('[run_upstream] - getting train dataloader...')

    # select mode
    transformer_cfg = config['transformer']
    use_dual = bool(transformer_cfg.get('dual_transformer', False))
    use_wave = bool(transformer_cfg.get('wave_transformer', False))
    # Missing keys simply count as False; the conflict itself must not be
    # swallowed (a blanket `except` around this check would hide it).
    if use_dual and use_wave:
        raise ValueError('`dual_transformer` and `wave_transformer` can not both be True!')

    if use_dual:
        # Must be decided in the same chain as the wave check, otherwise the
        # later branches overwrite it and 'dual_acoustic' is never used.
        load = 'dual_acoustic'
    elif use_wave:
        load = 'wave_acoustic'
    elif 'wave_transformer' in transformer_cfg:
        load = 'acoustic'
    elif bool(config['runner']['duo_feature']):
        load = 'duo'
    elif args.kaldi_data:
        load = 'kaldi'
    else:
        load = 'acoustic'

    # print path info
    if load == 'duo':
        print('[run_upstream] - Loading duo data: ' + str(loader_cfg['train_set'])
              + ' from ' + loader_cfg['target_path'])
    elif load == 'kaldi':
        print('[run_upstream] - Loading Kaldi data: ' + str(loader_cfg['data_path'])
              + ' from these sets ' + str(loader_cfg['train_set']))
    elif load == 'wave_acoustic':
        print('[run_upstream] - Loading wave data: ' + str(config['online']['libri_root'])
              + ' from these sets ' + str(loader_cfg['train_set']))
    elif load == 'acoustic' and 'online' in config:
        print('[run_upstream] - Using online data from root: ' + str(config['online']['libri_root']))
    elif load == 'acoustic':
        print('[run_upstream] - Loading data: ' + str(loader_cfg['data_path'])
              + ' from these sets ' + str(loader_cfg['train_set']))

    dataloader = get_Dataloader(split='train', load=load, use_gpu=args.gpu,
                                run_mam=True, mam_config=config['transformer'],
                                **config['dataloader'], **config)
    return dataloader
def load_data(self):
    """Load the training and dev sets with the 'text' loader.

    Sets `self.train_set` and `self.dev_set`, in that order.
    """
    solver_cfg = self.config['solver']
    self.verbose('Loading text data from ' + solver_cfg['data_path'])

    for split in ('train', 'dev'):
        loader = get_Dataloader(split, load='text', use_gpu=self.paras.gpu, **solver_cfg)
        setattr(self, split + '_set', loader)
def load_data(self, split='train'):
    """Announce the data being loaded and attach a 'spec' dataloader to `self.dataloader`.

    Args:
        split: 'train' or 'test'.

    Raises:
        NotImplementedError: if `split` is neither 'train' nor 'test'.
    """
    if split not in ('train', 'test'):
        raise NotImplementedError('Invalid `split` argument!')

    cfg = self.config
    if split == 'train':
        self.verbose('Loading source data from ' + str(cfg.train_set) + ' from ' + cfg.data_path)
    else:
        self.verbose('Loading testing data ' + str(cfg.test_set) + ' from ' + cfg.data_path)

    # Keyword arguments are collected first, then forwarded unchanged.
    loader_kwargs = dict(
        load='spec',
        data_path=cfg.data_path,
        batch_size=cfg.batch_size,
        max_timestep=3000,
        max_label_len=400,
        use_gpu=True,
        n_jobs=cfg.load_data_workers,
        train_set=cfg.train_set,
        dev_set=cfg.dev_set,
        test_set=cfg.test_set,
        dev_batch_size=1,
    )
    self.dataloader = get_Dataloader(split, **loader_kwargs)
def load_data(self, split='train', load='phone'):
    """Announce the data being loaded and attach the dataloader to `self.dataloader`.

    Args:
        split: 'train' or 'test' for the phone / speaker loaders; for
            'sentiment' it is only echoed in the log message.
        load: one of 'phone', 'cpc_phone', 'sentiment', 'speaker',
            'speaker_large'.

    Raises:
        AssertionError: if `load` is not one of the supported names.
        NotImplementedError: if `split` is invalid for the chosen loader.
    """
    supported = ['phone', 'cpc_phone', 'sentiment', 'speaker', 'speaker_large']
    assert load in supported, 'Unsupported dataloader!'

    if load in ('phone', 'cpc_phone', 'speaker_large'):
        if split not in ('train', 'test'):
            raise NotImplementedError('Invalid `split` argument!')
        loader_cfg = self.config['dataloader']
        if split == 'train':
            self.verbose('Loading source data from ' + str(loader_cfg['train_set'])
                         + ' from ' + loader_cfg['data_path'])
            # Only the two phone loaders also read phone labels.
            if load != 'speaker_large':
                self.verbose('Loading phone data from ' + str(loader_cfg['train_set'])
                             + ' from ' + loader_cfg['phone_path'])
        else:
            if load != 'cpc_phone':
                self.verbose('Loading testing data ' + str(loader_cfg['test_set'])
                             + ' from ' + loader_cfg['data_path'])
            if load == 'phone':
                self.verbose('Loading label data ' + str(loader_cfg['test_set'])
                             + ' from ' + loader_cfg['phone_path'])
            elif load == 'cpc_phone':
                self.verbose('Loading label data from ' + loader_cfg['phone_path'])
    elif load == 'speaker':
        if split not in ('train', 'test'):
            raise NotImplementedError('Invalid `split` argument!')
        loader_cfg = self.config['dataloader']
        # The logged set name has '360' replaced by '100'.
        if split == 'train':
            shown_sets = str(loader_cfg['train_set']).replace('360', '100')
            self.verbose('Loading source data from ' + shown_sets
                         + ' from ' + loader_cfg['data_path'])
        else:
            shown_sets = str(loader_cfg['test_set']).replace('360', '100')
            self.verbose('Loading testing data ' + shown_sets
                         + ' from ' + loader_cfg['data_path'])
    elif load == 'sentiment':
        sentiment_cfg = self.config['dataloader']['sentiment_config']
        target = sentiment_cfg['dataset']
        sentiment_path = sentiment_cfg[target]['path']
        self.verbose(f'Loading {split} data from {sentiment_path}')
    else:
        raise NotImplementedError('Unsupported downstream tasks.')

    self.dataloader = get_Dataloader(
        split,
        load=load,
        use_gpu=self.paras.gpu,
        run_mockingjay=self.run_mockingjay,
        mock_config=self.config['mockingjay'],
        **self.config['dataloader'])
def get_dataloader(args, config):
    """Build the training dataloader for upstream pre-training.

    The loader mode is 'duo' when `config['runner']['duo_feature']` is truthy,
    otherwise 'kaldi' when `args.kaldi_data` is truthy, otherwise 'acoustic'.

    Args:
        args: namespace providing `gpu` and `kaldi_data`.
        config: dict with 'dataloader', 'runner' and 'transformer' sections.

    Returns:
        The object returned by `get_Dataloader` for the 'train' split.

    Raises:
        RuntimeError: if `config['dataloader']['data_path']` does not exist.
    """
    loader_cfg = config['dataloader']
    if not os.path.exists(loader_cfg['data_path']):
        # A single formatted message: passing the path as a second positional
        # argument makes the exception render as a tuple repr.
        raise RuntimeError(f"[run_upstream] - Data path not valid: {loader_cfg['data_path']}")
    print('[run_upstream] - Loading input data: ' + str(loader_cfg['train_set'])
          + ' from ' + loader_cfg['data_path'])
    print('[run_upstream] - getting train dataloader...')

    if bool(config['runner']['duo_feature']):
        load = 'duo'
        print('[run_upstream] - Loading duo data: ' + str(loader_cfg['train_set'])
              + ' from ' + loader_cfg['target_path'])
    elif args.kaldi_data:
        load = 'kaldi'
        print('[run_upstream] - Loading Kaldi data: ' + str(loader_cfg['data_path'])
              + ' from these sets ' + str(loader_cfg['train_set']))
    else:
        load = 'acoustic'

    dataloader = get_Dataloader(split='train', load=load, use_gpu=args.gpu,
                                run_mam=True, mam_config=config['transformer'],
                                **config['dataloader'])
    return dataloader
def load_data(self, split='train'):
    """Announce the data being loaded and attach a dataloader to `self.dataloader`.

    Args:
        split: 'train' or 'test'.

    Raises:
        NotImplementedError: if `split` is neither 'train' nor 'test'.
    """
    if split not in ('train', 'test'):
        raise NotImplementedError('Invalid `split` argument!')

    loader_cfg = self.config['dataloader']
    if split == 'train':
        self.verbose('Loading source data ' + str(loader_cfg['train_set'])
                     + ' from ' + loader_cfg['data_path'])
        if self.duo_feature:
            self.verbose('Loading target data ' + str(loader_cfg['train_set'])
                         + ' from ' + loader_cfg['target_path'])
    else:
        self.verbose('Loading testing data ' + str(loader_cfg['test_set'])
                     + ' from ' + loader_cfg['data_path'])

    if self.duo_feature:
        # run_mam is automatically performed for the duo loader
        loader = get_Dataloader(
            split,
            load='duo',
            use_gpu=self.paras.gpu,
            mam_config=self.transformer_config,
            **loader_cfg)
    else:
        loader = get_Dataloader(
            split,
            load='acoustic',
            use_gpu=self.paras.gpu,
            run_mam=True,
            mam_config=self.transformer_config,
            **loader_cfg)
    self.dataloader = loader
def load_text(self, data_config):
    """Create the text-only loader and an iterator over it.

    Sets `self.train_set` to the loader built from `data_config` and
    `self.data_iter` to an iterator over that loader.
    """
    # Independent training set for CLM
    text_loader = get_Dataloader('text', text_only=True, **data_config)
    self.train_set = text_loader
    self.data_iter = iter(self.train_set)