Example #1
def __init__(self):
        # paths to the model checkpoints
        self.chroma_path = 'models/target_chroma.hdf5'
        self.dmfcc_path = 'models/target_dmfcc.hdf5'
        self.essentia_path = 'models/target_essentia.hdf5'
        self.genre_path = 'models/target_genre.hdf5'
        self.mfcc_path = 'models/target_mfcc.hdf5'
        self.original_path = 'models/target_original.hdf5'
        self.ultimate_path = 'models/ultimate.hdf5'
        self.audio_path = '/crowdai-payload'
        #self.audio_path = '/data/minz/sample'

        self.ap = AudioProcessor()
        self.sclr = Standardizer('preprocessing/sclr.dat.gz')
        self.output_path = '/tmp/output.csv'
Example #2
    def __init__(self,
                 csv_file,
                 root_dir,
                 sample_rate,
                 num_mels,
                 num_freq,
                 min_level_db,
                 frame_shift_ms,
                 frame_length_ms,
                 preemphasis,
                 ref_level_db,
                 num_quant,
                 min_wav_len=0,
                 max_wav_len=-1,
                 rand_offset=True):

        # read the metadata rows: wav_path, mel_file, wav_len, mel_len
        with open(csv_file, "r") as f:
            self.frames = [line.strip().split(', ') for line in f]

        self._parse_data()
        self.root_dir = root_dir

        self.sample_rate = sample_rate

        # length constraints applied when sampling training segments
        self.min_wav_len = min_wav_len
        self.max_wav_len = max_wav_len if max_wav_len > 0 else float("inf")
        self.rand_offset = rand_offset

        self.receptive_field = 2**num_quant

        self.ap = AudioProcessor(sample_rate, num_mels, num_freq, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        print(" | > Max wav length: {}".format(self.max_wav_len))
        print(" | > Min wav length: {}".format(self.min_wav_len))
        print(" | > Receptive field: {}".format(self.receptive_field))
        self._sort_frames()

Example #3
# imports added for completeness; AudioProcessor and load_config come from
# the project's own modules, whose import paths are not shown in the snippet
import argparse
import glob
import os

import numpy as np
import soundfile as sf

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str,
                    help='Folder path to the input wav files.')
parser.add_argument('--out_path', type=str,
                    help='Output folder for the extracted mel files.')
parser.add_argument('--config', type=str,
                    help='conf.json file for run settings.')
args = parser.parse_args()

DATA_PATH = args.data_path
OUT_PATH = args.out_path
CONFIG = load_config(args.config)
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.num_freq, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db)

def extract_mel(file_path):
    x, fs = sf.read(file_path)
    mel = ap.melspectrogram(x.astype('float32'))
    file_name = os.path.basename(file_path).replace(".wav", "")
    mel_file = file_name + ".mel"
    # np.save appends ".npy", so this writes "<name>.mel.npy", matching the
    # "+ '.npy'" lookup in LJSpeechDataset.__getitem__ below
    np.save(os.path.join(OUT_PATH, mel_file), mel, allow_pickle=False)
    mel_len = mel.shape[1]
    wav_len = x.shape[0]
    return file_path, mel_file, str(wav_len), str(mel_len)

glob_path = os.path.join(DATA_PATH, "**/*.wav")
file_names = glob.glob(glob_path, recursive=True)
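# The original snippet ends after collecting the file list. A minimal driver
# is sketched below (an assumption -- the original may have parallelized this
# step); it writes the "wav_path, mel_file, wav_len, mel_len" rows that the
# dataset classes below parse. "metadata.csv" is a hypothetical file name.
with open(os.path.join(OUT_PATH, "metadata.csv"), "w") as metadata:
    for file_path in file_names:
        row = extract_mel(file_path)
        metadata.write(", ".join(row) + "\n")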
Example #4
# imports added for completeness; AudioProcessor is a project-local module
# whose import path is not shown in the original snippet
import collections.abc
import os

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset


class LJSpeechDataset(Dataset):

    def __init__(self, csv_file, root_dir, sample_rate,
                 num_mels, num_freq, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db,
                 num_quant, min_wav_len=0, max_wav_len=-1, rand_offset=True):

        # metadata rows: wav_path, mel_file, wav_len, mel_len
        with open(csv_file, "r") as f:
            self.frames = [line.strip().split(', ') for line in f]
        self._parse_data()
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.min_wav_len = min_wav_len
        self.max_wav_len = max_wav_len if max_wav_len > 0 else float("inf")
        self.rand_offset = rand_offset
        self.receptive_field = 2 ** num_quant
        self.ap = AudioProcessor(sample_rate, num_mels, num_freq, min_level_db, frame_shift_ms,
                                 frame_length_ms, preemphasis, ref_level_db)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        print(" | > Max wav length: {}".format(self.max_wav_len))
        print(" | > Min wav length: {}".format(self.min_wav_len))
        print(" | > Receptive field: {}".format(self.receptive_field))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            # librosa.load returns an (audio, sample_rate) tuple
            audio = librosa.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))
    
    def _parse_data(self):
        self.wav_files = [f[0] for f in self.frames]
        self.mel_files = [f[1] for f in self.frames]
        self.wav_lengths = [int(f[2]) for f in self.frames]
        self.mel_lengths = [int(f[3]) for f in self.frames]

    def _sort_frames(self):
        r"""Sort sequences in ascending order"""
        print(" | > Max wav length {}".format(np.max(self.wav_lengths)))
        print(" | > Min wav length {}".format(np.min(self.wav_lengths)))
        print(" | > Avg wav length {}".format(np.mean(self.wav_lengths)))

        idxs = np.argsort(self.wav_lengths)
        new_frames = []
        ignored = []
        for i, idx in enumerate(idxs):
            length = self.wav_lengths[idx]
            if length < self.min_wav_len:
                ignored.append(idx)
            else:
                new_frames.append(self.frames[idx])
        print(" | > {} instances are ignored by min_wav_len ({})".format(
            len(ignored), self.min_wav_len))
        self.frames = new_frames
        self._parse_data()

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir, self.wav_files[idx])
        mel_name = os.path.join(self.root_dir, self.mel_files[idx] + '.npy')
        mel = np.load(mel_name)
        mel = mel.transpose(1, 0)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'mel': mel, 'wav': wav, 'item_idx': self.wav_files[idx]}
        return sample

    def collate_fn(self, batch):
        r"""
            Perform preprocessing and create a final data batch:
        """

        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.abc.Mapping):
            B = len(batch)
            # number of predicted samples per item, capped by max_wav_len
            pred_lens = [np.minimum(d['wav'].shape[0] - 1, self.max_wav_len - 1)
                         for d in batch]
            max_len = np.max(pred_lens) + self.receptive_field - 1
            if max_len > self.max_wav_len:
                max_len = self.max_wav_len
            wavs = np.zeros([B, max_len + self.receptive_field - 1])
            mels = np.zeros([B, max_len + self.receptive_field - 1, self.ap.num_mels])
            for idx, d in enumerate(batch):
                wav = d['wav']
                mel = d['mel']
                # mu-law encoding
                wav = self.ap.mulaw_encode(wav, 2**8)
                # align mel specs with wav by cloning frames
                mel = self.ap.align_feats(wav, mel)
                # if wav len is long, sample a starting offset
                if wav.shape[0] > self.max_wav_len:
                    gap = wav.shape[0] - self.max_wav_len
                    if self.rand_offset:
                        offset = np.random.randint(0, gap)
                    else:
                        offset = 0
                    wav = wav[offset:offset+self.max_wav_len]
                    mel = mel[offset:offset+self.max_wav_len]
                pad_w = max_len - wav.shape[0]
                assert wav.shape[0] == mel.shape[0]
                assert wav.shape[0] <= self.max_wav_len
                # pad left with receptive field and right with max_len in the batch
                wav = np.pad(wav, [self.receptive_field - 1, pad_w], 
                             mode='constant', constant_values=0.0)
                mel = np.pad(mel, [[self.receptive_field - 1, pad_w], [0, 0]], 
                             mode='constant', constant_values=0.0)
                wavs[idx] += wav 
                mels[idx] += mel
            # convert things to pytorch
            # B x T x D
            mels = torch.FloatTensor(mels[:, 1:])
            # B x T
            targets = torch.LongTensor(wavs[:, self.receptive_field:])
            inputs = torch.FloatTensor(wavs[:, :-1])
            pred_lens = torch.LongTensor(pred_lens)
            return inputs, mels, pred_lens, targets

        raise TypeError("batch must contain tensors, numbers, dicts or lists;"
                        " found {}".format(type(batch[0])))
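# A minimal usage sketch. The constructor values below are illustrative
# assumptions (not the project's actual configuration), and "metadata.csv" is
# a hypothetical file in the format produced by extract_mel above.
from torch.utils.data import DataLoader

dataset = LJSpeechDataset(
    csv_file="metadata.csv", root_dir="LJSpeech-1.1/", sample_rate=22050,
    num_mels=80, num_freq=1025, min_level_db=-100, frame_shift_ms=12.5,
    frame_length_ms=50, preemphasis=0.97, ref_level_db=20, num_quant=10,
    min_wav_len=2048, max_wav_len=8192)
loader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)
inputs, mels, pred_lens, targets = next(iter(loader))  # tensors, B x T (x D)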
Example #5
# (same imports as Example #4: collections.abc, os, librosa, numpy as np,
# torch, and torch.utils.data.Dataset; AudioProcessor is project-local)
class LJSpeechDataset(Dataset):
    """
    Extends torch.utils.data.Dataset and overrides __len__ and __getitem__ so
    that a DataLoader wrapped around it knows how to load and batch the data.
    """
    def __init__(self,
                 csv_file,
                 root_dir,
                 sample_rate,
                 num_mels,
                 num_freq,
                 min_level_db,
                 frame_shift_ms,
                 frame_length_ms,
                 preemphasis,
                 ref_level_db,
                 num_quant,
                 min_wav_len=0,
                 max_wav_len=-1,
                 rand_offset=True):

        # read the metadata rows: wav_path, mel_file, wav_len, mel_len
        with open(csv_file, "r") as f:
            self.frames = [line.strip().split(', ') for line in f]

        self._parse_data()
        self.root_dir = root_dir

        self.sample_rate = sample_rate

        # length constraints applied when sampling training segments
        self.min_wav_len = min_wav_len
        self.max_wav_len = max_wav_len if max_wav_len > 0 else float("inf")
        self.rand_offset = rand_offset

        self.receptive_field = 2**num_quant

        self.ap = AudioProcessor(sample_rate, num_mels, num_freq, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        print(" | > Max wav length: {}".format(self.max_wav_len))
        print(" | > Min wav length: {}".format(self.min_wav_len))
        print(" | > Receptive field: {}".format(self.receptive_field))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            # librosa.load returns an (audio, sample_rate) tuple
            audio = librosa.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))

    def _parse_data(self):

        self.wav_files = [f[0] for f in self.frames]
        self.mel_files = [f[1] for f in self.frames]
        self.wav_lengths = [int(f[2]) for f in self.frames]
        self.mel_lengths = [int(f[3]) for f in self.frames]

    def _sort_frames(self):
        r"""Sort sequences in ascending order"""
        print(" | > Max wav length {}".format(np.max(self.wav_lengths)))
        print(" | > Min wav length {}".format(np.min(self.wav_lengths)))
        print(" | > Avg wav length {}".format(np.mean(self.wav_lengths)))

        idxs = np.argsort(self.wav_lengths)
        new_frames = []
        ignored = []
        for i, idx in enumerate(idxs):
            length = self.wav_lengths[idx]
            if length < self.min_wav_len:
                ignored.append(idx)
            else:
                new_frames.append(self.frames[idx])
        print(" | > {} instances are ignored by min_wav_len ({})".format(
            len(ignored), self.min_wav_len))
        self.frames = new_frames
        self._parse_data()

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        """
        Load one (mel, wav) pair from disk.

        :param idx: index of the metadata row to load
        :return: dict with 'mel', 'wav' and 'item_idx' keys
        """

        wav_name = os.path.join(self.root_dir, self.wav_files[idx])
        mel_name = os.path.join(self.root_dir, self.mel_files[idx] + '.npy')
        mel = np.load(mel_name)
        mel = mel.transpose(1, 0)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'mel': mel, 'wav': wav, 'item_idx': self.wav_files[idx]}
        return sample

    def collate_fn(self, batch):
        """
            Perform pre-processing and create a final data batch. The 'batch' is coming from the __get_item__ function.
        """

        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.abc.Mapping):
            B = len(batch)

            # number of predicted samples per item, capped in case the
            # configured max_wav_len is shorter than the longest wav in the batch
            pred_lens = [
                np.minimum(d['wav'].shape[0] - 1, self.max_wav_len - 1)
                for d in batch
            ]
            max_len = np.max(pred_lens) + self.receptive_field - 1

            if max_len > self.max_wav_len:
                max_len = self.max_wav_len

            # initialize the batch of wavs and mels to all zeros in order to have the padding later on
            wavs = np.zeros([B, max_len + self.receptive_field - 1])
            mels = np.zeros(
                [B, max_len + self.receptive_field - 1, self.ap.num_mels])

            # iterate the batches one by one
            for idx, d in enumerate(batch):
                wav = d['wav']
                mel = d['mel']

                # mu-law encoding
                wav = self.ap.mulaw_encode(wav, 2**8)

                # align mel specs with wav by cloning frames such that wav and mel have the same length
                mel = self.ap.align_feats(wav, mel)

                # if wav len is long, sample a starting offset
                if wav.shape[0] > self.max_wav_len:
                    gap = wav.shape[0] - self.max_wav_len
                    if self.rand_offset:
                        offset = np.random.randint(0, gap)
                    else:
                        offset = 0
                    wav = wav[offset:offset + self.max_wav_len]
                    mel = mel[offset:offset + self.max_wav_len]

                # calculate the padding after the end of the actual content of the wav and mel
                pad_w = max_len - wav.shape[0]
                assert wav.shape[0] == mel.shape[0]
                assert wav.shape[0] <= self.max_wav_len

                # pad with zeros from the beginning until the receptive field
                # pad with zeros from the end of the actual content until the maximum possible length
                wav = np.pad(wav, [self.receptive_field - 1, pad_w],
                             mode='constant',
                             constant_values=0.0)
                mel = np.pad(mel, [[self.receptive_field - 1, pad_w], [0, 0]],
                             mode='constant',
                             constant_values=0.0)

                # update the batch with the actual values
                wavs[idx] += wav
                mels[idx] += mel

            # the mels are everything from the first element onwards
            mels = torch.FloatTensor(mels[:, 1:])

            # the target is everything from the receptive field onwards
            targets = torch.LongTensor(wavs[:, self.receptive_field:])

            # the inputs are everything but the last element, i.e. shifted by one
            inputs = torch.FloatTensor(wavs[:, :-1])
            pred_lens = torch.LongTensor(pred_lens)
            return inputs, mels, pred_lens, targets

        raise TypeError(
            ("batch must contain tensors, numbers, dicts or lists; found {}".
             format(type(batch[0]))))
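# To sanity-check how collate_fn's slicing aligns inputs with targets, here is
# a toy example with a hypothetical receptive field R = 4 and batch max_len = 5:
R, T = 4, 5
buf = np.arange(T + R - 1)    # padded buffer: [0 1 2 3 4 5 6 7]
inp, tgt = buf[:-1], buf[R:]  # inputs [0..6], targets [4 5 6 7]
# target tgt[i] (== buf[R + i]) is conditioned on the R samples buf[i : i + R]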
Example #6
# imports added for completeness; Network, NetworkT, AudioProcessor and
# Standardizer are project-local modules whose import paths are not shown
import csv
import glob
import os

import numpy as np
import tqdm
from keras import backend as K


class Transfer:
    def __init__(self):
        # paths to the model checkpoints
        self.chroma_path = 'models/target_chroma.hdf5'
        self.dmfcc_path = 'models/target_dmfcc.hdf5'
        self.essentia_path = 'models/target_essentia.hdf5'
        self.genre_path = 'models/target_genre.hdf5'
        self.mfcc_path = 'models/target_mfcc.hdf5'
        self.original_path = 'models/target_original.hdf5'
        self.ultimate_path = 'models/ultimate.hdf5'
        self.audio_path = '/crowdai-payload'
        #self.audio_path = '/data/minz/sample'

        self.ap = AudioProcessor()
        self.sclr = Standardizer('preprocessing/sclr.dat.gz')
        self.output_path = '/tmp/output.csv'

    def run(self, gpu_id="0"):
        # set gpu id
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        # set models for the feature extraction
        self.get_models()
        # get filelist
        self.get_filelist()
        # iteration
        self.iter_predictions()

    def get_models(self):
        # chroma
        _cn = Network(num_tags=40)
        model_chroma = _cn.model
        model_chroma.load_weights(self.chroma_path)
        self.ff_c = K.function(
            [model_chroma.input, K.learning_phase()],
            [model_chroma.layers[29].output])
        # dmfcc
        _dn = Network(num_tags=40)
        model_dmfcc = _dn.model
        model_dmfcc.load_weights(self.dmfcc_path)
        self.ff_d = K.function(
            [model_dmfcc.input, K.learning_phase()],
            [model_dmfcc.layers[29].output])
        # essentia
        _en = Network(num_tags=40)
        model_essentia = _en.model
        model_essentia.load_weights(self.essentia_path)
        self.ff_e = K.function(
            [model_essentia.input, K.learning_phase()],
            [model_essentia.layers[29].output])
        # genre
        _gn = Network(num_tags=40)
        model_genre = _gn.model
        model_genre.load_weights(self.genre_path)
        self.ff_g = K.function(
            [model_genre.input, K.learning_phase()],
            [model_genre.layers[29].output])
        # mfcc
        _mn = Network(num_tags=40)
        model_mfcc = _mn.model
        model_mfcc.load_weights(self.mfcc_path)
        self.ff_m = K.function(
            [model_mfcc.input, K.learning_phase()],
            [model_mfcc.layers[29].output])
        # original
        _on = Network(num_tags=16)
        model_original = _on.model
        model_original.load_weights(self.original_path)
        self.ff_o = K.function(
            [model_original.input, K.learning_phase()],
            [model_original.layers[29].output])
        # ultimate
        _un = NetworkT(num_tags=16)
        self.model_ultimate = _un.model
        self.model_ultimate.load_weights(self.ultimate_path)
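    # NOTE: the six branch extractors above differ only in checkpoint path and
    # tag count; one way to fold them into a loop (a sketch, not the original
    # code) would be:
    #
    #     for name, path, n_tags in [("c", self.chroma_path, 40),
    #                                ("d", self.dmfcc_path, 40),
    #                                ("e", self.essentia_path, 40),
    #                                ("g", self.genre_path, 40),
    #                                ("m", self.mfcc_path, 40),
    #                                ("o", self.original_path, 16)]:
    #         net = Network(num_tags=n_tags)
    #         net.model.load_weights(path)
    #         setattr(self, "ff_" + name,
    #                 K.function([net.model.input, K.learning_phase()],
    #                            [net.model.layers[29].output]))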

    def get_filelist(self):
        self.filelist = glob.glob(os.path.join(self.audio_path, '*.mp3'))

    def get_spectrogram(self, fn):
        X = self.ap.mel(self.ap.forward(self.ap.load(fn)), logamp=True)
        # slide a 43-frame window over the spectrogram with a hop of 10 frames;
        # integer division is required here under Python 3
        num_chunk = (X.shape[0] - 43) // 10
        spectrograms = []
        for i in range(num_chunk):
            spectrograms.append(X[i * 10:i * 10 + 43].reshape(1, 43, 128))
        spectrograms = np.array(spectrograms)
        spectrograms = self.sclr.transform_batch(spectrograms)
        spectrograms = spectrograms.transpose(0, 3, 2, 1)
        return spectrograms
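    # e.g., a log-mel matrix with 143 frames yields (143 - 43) // 10 = 10
    # overlapping windows of 43 frames each, hopped by 10 frames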

    def get_features(self, fn):
        spectrograms = self.get_spectrogram(fn)
        feat_c = self.ff_c([spectrograms, 0])[0]
        feat_d = self.ff_d([spectrograms, 0])[0]
        feat_e = self.ff_e([spectrograms, 0])[0]
        feat_g = self.ff_g([spectrograms, 0])[0]
        feat_m = self.ff_m([spectrograms, 0])[0]
        feat_o = self.ff_o([spectrograms, 0])[0]
        features = np.concatenate(
            (feat_c, feat_d, feat_e, feat_g, feat_m, feat_o), axis=1)
        return features

    def get_prediction(self, fn):
        features = self.get_features(fn)
        prd = self.model_ultimate.predict(features)
        prd = np.mean(prd, axis=0)
        return prd

    def iter_predictions(self):
        CLASSES = [
            'Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic',
            'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International',
            'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken'
        ]
        HEADERS = ['file_id'] + CLASSES
        csvfile = open(self.output_path, "w")
        writer = csv.DictWriter(csvfile, fieldnames=HEADERS)
        writer.writeheader()
        TEST_FILES = sorted(self.filelist)

        for fn in tqdm.tqdm(TEST_FILES):
            _track_id = os.path.basename(fn).replace(".mp3", "")
            prd = self.get_prediction(fn)
            row = {}
            row['file_id'] = _track_id
            for _idx, _class in enumerate(CLASSES):
                row[_class] = prd[_idx]
            writer.writerow(row)
        csvfile.close()
        print("Output file written at ", self.output_path)