import argparse
import glob
import os

import numpy as np
import soundfile as sf

# project-local modules (import paths assumed)
from utils.audio import AudioProcessor
from utils.generic_utils import load_config

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str,
                    help='Folder path to the wav files.')
parser.add_argument('--out_path', type=str,
                    help='Output folder for the extracted mel features.')
parser.add_argument('--config', type=str,
                    help='conf.json file for run settings.')
args = parser.parse_args()

DATA_PATH = args.data_path
OUT_PATH = args.out_path
CONFIG = load_config(args.config)

ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.num_freq,
                    CONFIG.min_level_db, CONFIG.frame_shift_ms,
                    CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db)


def extract_mel(file_path):
    """Compute and save the mel spectrogram of a single wav file."""
    x, fs = sf.read(file_path)
    mel = ap.melspectrogram(x.astype('float32'))
    file_name = os.path.basename(file_path).replace(".wav", "")
    mel_file = file_name + ".mel"
    # np.save appends ".npy" to the target file name
    np.save(os.path.join(OUT_PATH, mel_file), mel, allow_pickle=False)
    mel_len = mel.shape[1]
    wav_len = x.shape[0]
    return file_path, mel_file, str(wav_len), str(mel_len)


glob_path = os.path.join(DATA_PATH, "**/*.wav")
file_names = glob.glob(glob_path, recursive=True)
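# --- A minimal driver sketch (not part of the original script): one plausible
# way to run extract_mel over the collected files and write the metadata file
# that LJSpeechDataset below reads, i.e. one "wav_path, mel_file, wav_len,
# mel_len" line per file. The output name "metadata.csv" is an assumption.
if __name__ == "__main__":
    os.makedirs(OUT_PATH, exist_ok=True)
    with open(os.path.join(OUT_PATH, "metadata.csv"), "w") as meta:
        for file_path in file_names:
            fields = extract_mel(file_path)
            meta.write(", ".join(fields) + "\n")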
class LJSpeechDataset(Dataset): """ This class extends torch.utils.data.Dataset class and for this reason it overrides the two functions: __len__ and __get_item__ in order another wrapper around it to know how to load and handle the batches. """ def __init__(self, csv_file, root_dir, sample_rate, num_mels, num_freq, min_level_db, frame_shift_ms, frame_length_ms, preemphasis, ref_level_db, num_quant, min_wav_len=0, max_wav_len=-1, rand_offset=True): # reads the metadata with open(csv_file, "r") as f: self.frames = [line.split(', ') for line in f] self._parse_data() self.root_dir = root_dir self.sample_rate = sample_rate # the self.min_wav_len = min_wav_len self.max_wav_len = max_wav_len if max_wav_len > 0 else inf self.rand_offset = rand_offset self.receptive_field = 2**num_quant self.ap = AudioProcessor(sample_rate, num_mels, num_freq, min_level_db, frame_shift_ms, frame_length_ms, preemphasis, ref_level_db) print(" > Reading LJSpeech from - {}".format(root_dir)) print(" | > Number of instances : {}".format(len(self.frames))) print(" | > Max wav length: {}".format(self.max_wav_len)) print(" | > Min wav length: {}".format(self.min_wav_len)) print(" | > Receptive field: {}".format(self.receptive_field)) self._sort_frames() def load_wav(self, filename): try: audio = librosa.core.load(filename, sr=self.sample_rate) return audio except RuntimeError as e: print(" !! Cannot read file : {}".format(filename)) def _parse_data(self): self.wav_files = [f[0] for f in self.frames] self.mel_files = [f[1] for f in self.frames] self.wav_lengths = [int(f[2]) for f in self.frames] self.mel_lengths = [int(f[3]) for f in self.frames] def _sort_frames(self): r"""Sort sequences in ascending order""" print(" | > Max wav length {}".format(np.max(self.wav_lengths))) print(" | > Min wav length {}".format(np.min(self.wav_lengths))) print(" | > Avg wav length {}".format(np.mean(self.wav_lengths))) idxs = np.argsort(self.wav_lengths) new_frames = [] ignored = [] for i, idx in enumerate(idxs): length = self.wav_lengths[idx] if length < self.min_wav_len: ignored.append(idx) else: new_frames.append(self.frames[idx]) print(" | > {} instances are ignored by min_wav_len ({})".format( len(ignored), self.min_wav_len)) self.frames = new_frames self._parse_data() def __len__(self): return len(self.frames) def __getitem__(self, idx): """ :param idx: :return: """ wav_name = os.path.join(self.root_dir, self.wav_files[idx]) mel_name = os.path.join(self.root_dir, self.mel_files[idx] + '.npy') mel = np.load(mel_name) mel = mel.transpose(1, 0) wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32) sample = {'mel': mel, 'wav': wav, 'item_idx': self.wav_files[idx]} return sample def collate_fn(self, batch): """ Perform pre-processing and create a final data batch. The 'batch' is coming from the __get_item__ function. """ # Puts each data field into a tensor with outer dimension batch size if isinstance(batch[0], collections.Mapping): B = len(batch) # the length for each wave in the batch. 
In case the maximum defined length is less than the # actual maximum length in the data set pred_lens = [ np.minimum(d['wav'].shape[0] - 1, self.max_wav_len - 1) for d in batch ] max_len = np.max(pred_lens) + self.receptive_field - 1 if max_len > self.max_wav_len: max_len = self.max_wav_len # initialize the batch of wavs and mels to all zeros in order to have the padding later on wavs = np.zeros([B, max_len + self.receptive_field - 1]) mels = np.zeros( [B, max_len + self.receptive_field - 1, self.ap.num_mels]) # iterate the batches one by one for idx, d in enumerate(batch): wav = d['wav'] mel = d['mel'] # mu-law encoding wav = self.ap.mulaw_encode(wav, 2**8) # align mel specs with wav by cloning frames such that wav and mel have the same length mel = self.ap.align_feats(wav, mel) # if wav len is long, sample a starting offset if wav.shape[0] > self.max_wav_len: gap = wav.shape[0] - self.max_wav_len if self.rand_offset: offset = np.random.randint(0, gap) else: offset = 0 wav = wav[offset:offset + self.max_wav_len] mel = mel[offset:offset + self.max_wav_len] # calculate the padding after the end of the actual content of the wav and mel pad_w = max_len - wav.shape[0] assert wav.shape[0] == mel.shape[0] assert wav.shape[0] <= self.max_wav_len # pad with zeros from the beginning until the receptive field # pad with zeros from the end of the actual content until the maximum possible length wav = np.pad(wav, [self.receptive_field - 1, pad_w], mode='constant', constant_values=0.0) mel = np.pad(mel, [[self.receptive_field - 1, pad_w], [0, 0]], mode='constant', constant_values=0.0) # update the batch with the actual values wavs[idx] += wav mels[idx] += mel # the mels are everything from the first element onwards mels = torch.FloatTensor(mels[:, 1:]) # the target is everything from the receptive field onwards targets = torch.LongTensor(wavs[:, self.receptive_field:]) # the inputs are everything but the last element, i.e. shifted by one inputs = torch.FloatTensor(wavs[:, :-1]) pred_lens = torch.LongTensor(pred_lens) return inputs, mels, pred_lens, targets raise TypeError( ("batch must contain tensors, numbers, dicts or lists; found {}". format(type(batch[0]))))
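# --- A minimal usage sketch (not from the original source): wiring the
# dataset into a torch DataLoader with its collate_fn. All paths and audio
# parameters below are placeholder assumptions, not values from the source.
from torch.utils.data import DataLoader

dataset = LJSpeechDataset(
    csv_file="data/metadata.csv", root_dir="data", sample_rate=22050,
    num_mels=80, num_freq=1025, min_level_db=-100, frame_shift_ms=12.5,
    frame_length_ms=50, preemphasis=0.97, ref_level_db=20, num_quant=10,
    min_wav_len=1000, max_wav_len=22050)
loader = DataLoader(dataset, batch_size=8, shuffle=False,
                    collate_fn=dataset.collate_fn, num_workers=2)
for inputs, mels, pred_lens, targets in loader:
    # inputs: B x T waveform inputs, mels: B x T x num_mels conditioning,
    # targets: B x T mu-law class indices
    break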
import csv
import glob
import os

import numpy as np
import tqdm
from keras import backend as K

# project-local modules (import paths assumed)
from model import Network, NetworkT
from preprocessing.audio_processor import AudioProcessor
from preprocessing.standardizer import Standardizer


class Transfer:
    def __init__(self):
        # paths to the per-feature model checkpoints
        self.chroma_path = 'models/target_chroma.hdf5'
        self.dmfcc_path = 'models/target_dmfcc.hdf5'
        self.essentia_path = 'models/target_essentia.hdf5'
        self.genre_path = 'models/target_genre.hdf5'
        self.mfcc_path = 'models/target_mfcc.hdf5'
        self.original_path = 'models/target_original.hdf5'
        self.ultimate_path = 'models/ultimate.hdf5'
        self.audio_path = '/crowdai-payload'
        # self.audio_path = '/data/minz/sample'
        self.ap = AudioProcessor()
        self.sclr = Standardizer('preprocessing/sclr.dat.gz')
        self.output_path = '/tmp/output.csv'

    def run(self, gpu_id="0"):
        # set gpu id
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        # load the feature-extraction models
        self.get_models()
        # collect the audio file list
        self.get_filelist()
        # predict and write one row per file
        self.iter_predictions()

    def get_models(self):
        # chroma
        _cn = Network(num_tags=40)
        model_chroma = _cn.model
        model_chroma.load_weights(self.chroma_path)
        self.ff_c = K.function([model_chroma.input, K.learning_phase()],
                               [model_chroma.layers[29].output])
        # dmfcc
        _dn = Network(num_tags=40)
        model_dmfcc = _dn.model
        model_dmfcc.load_weights(self.dmfcc_path)
        self.ff_d = K.function([model_dmfcc.input, K.learning_phase()],
                               [model_dmfcc.layers[29].output])
        # essentia
        _en = Network(num_tags=40)
        model_essentia = _en.model
        model_essentia.load_weights(self.essentia_path)
        self.ff_e = K.function([model_essentia.input, K.learning_phase()],
                               [model_essentia.layers[29].output])
        # genre
        _gn = Network(num_tags=40)
        model_genre = _gn.model
        model_genre.load_weights(self.genre_path)
        self.ff_g = K.function([model_genre.input, K.learning_phase()],
                               [model_genre.layers[29].output])
        # mfcc
        _mn = Network(num_tags=40)
        model_mfcc = _mn.model
        model_mfcc.load_weights(self.mfcc_path)
        self.ff_m = K.function([model_mfcc.input, K.learning_phase()],
                               [model_mfcc.layers[29].output])
        # original
        _on = Network(num_tags=16)
        model_original = _on.model
        model_original.load_weights(self.original_path)
        self.ff_o = K.function([model_original.input, K.learning_phase()],
                               [model_original.layers[29].output])
        # ultimate: the classifier stacked on the concatenated features
        _un = NetworkT(num_tags=16)
        self.model_ultimate = _un.model
        self.model_ultimate.load_weights(self.ultimate_path)

    def get_filelist(self):
        self.filelist = glob.glob(os.path.join(self.audio_path, '*.mp3'))

    def get_spectrogram(self, fn):
        X = self.ap.mel(self.ap.forward(self.ap.load(fn)), logamp=True)
        # slide a 43-frame window over the mel spectrogram with a hop of
        # 10 frames
        num_chunk = (X.shape[0] - 43) // 10
        spectrograms = []
        for i in range(num_chunk):
            spectrograms.append(X[i * 10:i * 10 + 43].reshape(1, 43, 128))
        spectrograms = np.array(spectrograms)
        spectrograms = self.sclr.transform_batch(spectrograms)
        spectrograms = spectrograms.transpose(0, 3, 2, 1)
        return spectrograms

    def get_features(self, fn):
        spectrograms = self.get_spectrogram(fn)
        # run each feature extractor in inference mode (learning_phase=0)
        feat_c = self.ff_c([spectrograms, 0])[0]
        feat_d = self.ff_d([spectrograms, 0])[0]
        feat_e = self.ff_e([spectrograms, 0])[0]
        feat_g = self.ff_g([spectrograms, 0])[0]
        feat_m = self.ff_m([spectrograms, 0])[0]
        feat_o = self.ff_o([spectrograms, 0])[0]
        features = np.concatenate(
            (feat_c, feat_d, feat_e, feat_g, feat_m, feat_o), axis=1)
        return features

    def get_prediction(self, fn):
        features = self.get_features(fn)
        prd = self.model_ultimate.predict(features)
        # average the chunk-level predictions over the whole track
        prd = np.mean(prd, axis=0)
        return prd

    def iter_predictions(self):
        CLASSES = [
            'Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic',
            'Experimental', 'Folk', 'Hip-Hop', 'Instrumental',
            'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock',
            'Soul-RnB', 'Spoken'
        ]
        HEADERS = ['file_id'] + CLASSES
        with open(self.output_path, "w") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=HEADERS)
            writer.writeheader()
            test_files = sorted(self.filelist)
            for fn in tqdm.tqdm(test_files):
                _track_id = os.path.basename(fn).replace(".mp3", "")
                prd = self.get_prediction(fn)
                row = {'file_id': _track_id}
                for _idx, _class in enumerate(CLASSES):
                    row[_class] = prd[_idx]
                writer.writerow(row)
        print("Output file written at ", self.output_path)