Пример #1
0
    def __init__(self,
                 feature_hdf5_path,
                 feature_hdf5_path_left,
                 feature_hdf5_path_right,
                 feature_hdf5_path_side,
                 train_csv,
                 validate_csv,
                 batch_size,
                 seed=1234):
        '''Set up the data generator used for training and validation.

        Args:
          feature_hdf5_path: string, hdf5 path handed to self.load_hdf5
          feature_hdf5_path_left: string, hdf5 path handed to
              self.load_hdf5_left
          feature_hdf5_path_right: string, hdf5 path handed to
              self.load_hdf5_right
          feature_hdf5_path_side: string, hdf5 path handed to
              self.load_hdf5_side
          train_csv: string, path of train csv file
          validate_csv: string, path of validate csv file
          batch_size: int
          seed: int, random seed
        '''

        self.batch_size = batch_size
        self.random_state = np.random.RandomState(seed)

        # Class bookkeeping comes straight from the project config; the
        # in-domain count is one less than the full label count.
        labels_total = len(config.labels)
        self.in_domain_classes_num = labels_total - 1
        self.all_classes_num = labels_total
        self.lb_to_idx = config.lb_to_idx
        self.idx_to_lb = config.idx_to_lb

        started_at = time.time()

        # One data dict per hdf5 file, each with its own loader.
        self.data_dict = self.load_hdf5(feature_hdf5_path)
        self.data_dict_left = self.load_hdf5_left(feature_hdf5_path_left)
        self.data_dict_right = self.load_hdf5_right(feature_hdf5_path_right)
        self.data_dict_side = self.load_hdf5_side(feature_hdf5_path_side)

        meta_for_train = read_metadata(train_csv)
        meta_for_validate = read_metadata(validate_csv)

        # Both splits are indexed against the main data dict only.
        self.train_audio_indexes = self.get_audio_indexes(
            meta_for_train, self.data_dict, 'train')
        self.validate_audio_indexes = self.get_audio_indexes(
            meta_for_validate, self.data_dict, 'validate')

        elapsed = time.time() - started_at
        train_count = len(self.train_audio_indexes)
        validate_count = len(self.validate_audio_indexes)

        logging.info('Load data time: {:.3f} s'.format(elapsed))
        logging.info('Training audio num: {}'.format(train_count))
        logging.info('Validation audio num: {}'.format(validate_count))
    def __init__(self, feature_hdf5_path, train_csv, validate_csv, holdout_fold, 
        scalar, batch_size, seed=1234):
        '''Set up the data generator used for training and validation.

        Args:
          feature_hdf5_path: string, path of hdf5 feature file
          train_csv: string, path of train csv file
          validate_csv: string, path of validate csv file
          holdout_fold: forwarded to self.get_audio_indexes; the string
              'none' merges the validation indexes into the training
              indexes and leaves the validation set empty
          scalar: object, stored on the instance as self.scalar
          batch_size: int
          seed: int, random seed
        '''

        self.scalar = scalar
        self.batch_size = batch_size
        self.random_state = np.random.RandomState(seed)

        # Both class counts are the config label count minus one.
        classes_minus_one = len(config.labels) - 1
        self.in_domain_classes_num = classes_minus_one
        self.all_classes_num = classes_minus_one
        self.lb_to_idx = config.lb_to_idx
        self.idx_to_lb = config.idx_to_lb

        started_at = time.time()

        self.data_dict = self.load_hdf5(feature_hdf5_path)

        meta_for_train = read_metadata(train_csv)
        meta_for_validate = read_metadata(validate_csv)

        self.train_audio_indexes = self.get_audio_indexes(
            meta_for_train, self.data_dict, holdout_fold, 'train')
        self.validate_audio_indexes = self.get_audio_indexes(
            meta_for_validate, self.data_dict, holdout_fold, 'validate')

        if holdout_fold == 'none':
            # No hold-out: train on everything, validate on nothing.
            merged = np.concatenate(
                (self.train_audio_indexes, self.validate_audio_indexes),
                axis=0)
            self.train_audio_indexes = merged
            self.validate_audio_indexes = np.array([])

        elapsed = time.time() - started_at
        logging.info('Load data time: {:.3f} s'.format(elapsed))
        logging.info('Training audio num: {}'.format(
            len(self.train_audio_indexes)))
        logging.info('Validation audio num: {}'.format(
            len(self.validate_audio_indexes)))

        # Shuffle the training order in place and start reading from the top.
        self.random_state.shuffle(self.train_audio_indexes)
        self.pointer = 0
Пример #3
0
def get_playground_mask(img_path):
    """Return the playground mask for the image stored at ``img_path``.

    The polygon is read from the image metadata under the
    ``"playground_poly"`` key. When the key is absent,
    ``utilities.select_polygon`` is called on the image and its result is
    stored through ``update_metadata`` before the mask is built.
    """
    image = cv2.imread(img_path)
    metadata = read_metadata(img_path)

    if "playground_poly" in metadata:
        polygon = metadata["playground_poly"]
    else:
        chosen = utilities.select_polygon(image)
        metadata = update_metadata(img_path, {"playground_poly": chosen})
        # Read the polygon back from the refreshed metadata.
        polygon = metadata["playground_poly"]

    return utilities.poly2mask(polygon, image)
Пример #4
0
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5 file. 

    Each clip is read with read_side_audio, padded or truncated to
    config.total_samples, converted with LogMelExtractor and stored as one
    row of the 'feature_side' dataset. The metadata columns that are present
    (audio_name, scene_label, identifier, source_label) are stored as
    byte-string datasets next to it.

    Args:
      args: namespace with the attributes
        dataset_dir: string
        workspace: string
        subtask: 'a' | 'b' | 'c'
        data_type: 'development' | 'evaluation'
        mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples

    # Paths
    prefix = 'minidata_' if mini_data else ''

    sub_dir = get_subdir(subtask, data_type)
    metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    feature_path = os.path.join(
        workspace, 'features_side',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Columns stored as byte strings, with the dtype used for each.
    optional_columns = (('scene_label', 'S24'),
                        ('identifier', 'S24'),
                        ('source_label', 'S8'))

    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        # Cap the sample size: choice(..., replace=False) raises when asked
        # for more items than exist.
        indexes = random_state.choice(
            total_num, size=min(mini_num, total_num), replace=False)
        meta_dict['audio_name'] = meta_dict['audio_name'][indexes]
        # The label columns are optional below, so only subsample the ones
        # the metadata actually contains.
        for (key, _) in optional_columns:
            if key in meta_dict:
                meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets. The context manager closes
    # the file even when reading or transforming a clip raises.
    with h5py.File(feature_path, 'w') as hf:

        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode()
                  for audio_name in meta_dict['audio_name']],
            dtype='S80')

        for (key, string_dtype) in optional_columns:
            if key in meta_dict:
                hf.create_dataset(
                    name=key,
                    data=[value.encode() for value in meta_dict[key]],
                    dtype=string_dtype)

        # Starts empty and grows by one row per clip.
        hf.create_dataset(name='feature_side',
                          shape=(0, frames_num, mel_bins),
                          maxshape=(None, frames_num, mel_bins),
                          dtype=np.float32)

        for (n, audio_name) in enumerate(meta_dict['audio_name']):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Read audio
            (audio, _) = read_side_audio(audio_path=audio_path,
                                         target_fs=sample_rate)

            # Pad or truncate audio recording to the same length
            audio = pad_truncate_sequence(audio, total_samples)

            # Extract feature
            feature = feature_extractor.transform(audio)

            # Remove the extra log mel spectrogram frames caused by padding
            # zero
            feature = feature[0:frames_num]

            hf['feature_side'].resize((n + 1, frames_num, mel_bins))
            hf['feature_side'][n] = feature

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
Пример #5
0
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5 file. 

    Clips are not padded: the features of all clips are concatenated along
    the first axis of the 'feature' dataset, and 'begin_index' / 'end_index'
    record the row range belonging to each clip.

    The dataset directory and workspace come from the module-level
    DATASET_DIR and WORKSPACE, not from args.

    Args:
      args: namespace with the attributes
        data_type: 'train_curated', 'train_noisy', 'test'
        mini_data: bool, set True for debugging on a small part of data

    Raises:
      ValueError: if data_type is not one of the three supported values.
    '''

    # Arguments & parameters
    dataset_dir = DATASET_DIR
    workspace = WORKSPACE
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    lb_to_idx = config.lb_to_idx

    labelled_types = ['train_curated', 'train_noisy']

    # Fail early with a clear message instead of an unbound-variable error
    # further down.
    if data_type not in labelled_types and data_type != 'test':
        raise ValueError('Incorrect data_type: {!r}'.format(data_type))

    # Paths
    prefix = 'minidata_' if mini_data else ''

    audios_dir = os.path.join(dataset_dir, data_type)

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(data_type))
    create_folder(os.path.dirname(feature_path))

    # Read meta data
    if data_type in labelled_types:
        metadata_path = os.path.join(dataset_dir, '{}.csv'.format(data_type))
        meta_dict = read_metadata(metadata_path, lb_to_idx)
    else:
        # The test split has no csv; use the sorted directory listing.
        meta_dict = {'audio_name': np.array(sorted(os.listdir(audios_dir)))}

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    if mini_data:
        mini_num = 100
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        # Cap the sample size: choice(..., replace=False) raises when asked
        # for more items than exist.
        indexes = random_state.choice(
            total_num, size=min(mini_num, total_num), replace=False)
        meta_dict['audio_name'] = meta_dict['audio_name'][indexes]
        if 'target' in meta_dict:
            meta_dict['target'] = meta_dict['target'][indexes]

    # Hdf5 file for storing features and targets
    print('Extracting features of all audio files ...')
    extract_time = time.time()

    audios_num = len(meta_dict['audio_name'])

    # The context manager closes the file even when a clip fails to load.
    with h5py.File(feature_path, 'w') as hf:

        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode()
                  for audio_name in meta_dict['audio_name']],
            dtype='S20')

        if 'target' in meta_dict:
            # np.bool_ instead of the np.bool alias, which was removed from
            # NumPy.
            hf.create_dataset(name='target',
                              data=meta_dict['target'],
                              dtype=np.bool_)

        # Starts empty and grows by the frame count of each clip.
        hf.create_dataset(name='feature',
                          shape=(0, mel_bins),
                          maxshape=(None, mel_bins),
                          dtype=np.float32)

        hf.create_dataset(name='begin_index',
                          shape=(audios_num, ),
                          dtype=np.int32)

        hf.create_dataset(name='end_index',
                          shape=(audios_num, ),
                          dtype=np.int32)

        for (n, audio_name) in enumerate(meta_dict['audio_name']):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Read audio
            (audio, _) = read_audio(audio_path=audio_path,
                                    target_fs=sample_rate)

            # Extract feature
            feature = feature_extractor.transform(audio)
            print(feature.shape)

            # Append this clip's frames after everything written so far.
            begin_index = hf['feature'].shape[0]
            end_index = begin_index + feature.shape[0]
            hf['feature'].resize((end_index, mel_bins))
            hf['feature'][begin_index:end_index, :] = feature

            hf['begin_index'][n] = begin_index
            hf['end_index'][n] = end_index

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
Пример #6
0
import cv2
import utilities
from utilities import read_metadata
from utilities import update_metadata

# from matplotlib import pyplot as plt


def apply_mask(img, mask):
    """Return ``img`` restricted to ``mask``.

    Delegates to ``cv2.bitwise_and`` with ``img`` as both operands and
    ``mask`` as the mask argument.
    """
    masked = cv2.bitwise_and(img, img, mask=mask)
    return masked

if __name__ == "__main__":
    # Script entry point: build the playground mask for the image given on
    # the command line and read the ball entry from its metadata.

    # Imported locally because the visible module-level imports do not
    # include `sys`.
    import sys

    # Without a path there is nothing to process; exit with a usage message
    # instead of hitting an unbound `img_path` below.
    if len(sys.argv) < 2:
        sys.exit("usage: {} IMAGE_PATH".format(sys.argv[0]))
    img_path = sys.argv[1]

    img = cv2.imread(img_path)
    # cv2.imread signals an unreadable file by returning None rather than
    # raising.
    if img is None:
        sys.exit("could not read image: {}".format(img_path))
    meta = read_metadata(img_path)

    # Select the polygon and store it the first time an image is seen.
    if "playground_poly" not in meta:
        playground_poly = utilities.select_polygon(img)
        meta = update_metadata(img_path, {"playground_poly": playground_poly})

    playground_mask = utilities.poly2mask(meta["playground_poly"], img)
    # NOTE(review): the key "ba" looks truncated - confirm the intended
    # metadata key.
    ball_circles = meta["ba"]


    # # utilities.show(playground_mask)
    # # utilities.show(cv2.bitwise_not(playground_mask))

    # hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # hue = hsv[:,:,0]
Пример #7
0
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5 file. 

    Four datasets are written per clip: the padded/truncated waveform
    ('feature'), a gammatone feature ('feature_gamm'), an MFCC feature
    ('feature_mfcc') and a waveform loaded at 32 kHz ('feature_panns').

    NOTE(review): `args` is currently ignored. The dataset directory,
    workspace, subtask, data type and mini_data flag are hard-coded below;
    confirm whether reading them from `args` should be restored.

    Args:
      args: unused, see the note above.

    Raises:
      Exception: if data_type is neither 'development' nor 'leaderboard'.
    '''

    # Arguments & parameters
    # dataset_dir = args.dataset_dir
    # workspace = args.workspace
    # subtask = args.subtask
    # data_type = args.data_type
    # mini_data = args.mini_data
    
    dataset_dir = 'D:/Project/DCASE_test/Data'
    workspace = 'D:/Project/DCASE_test'
    subtask = 'a'
    data_type = 'development'
    mini_data = False
    
    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm

    # Row length of 'feature_panns': 320000 samples, i.e. 10 s at the
    # 32 kHz rate the waveform is loaded with below.
    panns_sample_rate = 32000
    panns_samples = 320000

    # Paths
    prefix = 'minidata_' if mini_data else ''
        
    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(
            dataset_dir, sub_dir, 'evaluation_setup', 'test.csv')
    else:
        raise Exception('Incorrect data_type!')
    
    feature_path = os.path.join(workspace, 'features', 
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))
        
    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        hop_size=hop_size, 
        mel_bins=mel_bins, 
        fmin=fmin, 
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets 
    if mini_data:
        mini_num = 300
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        # Cap the sample size: choice(..., replace=False) raises when asked
        # for more items than exist.
        indexes = random_state.choice(
            total_num, size=min(mini_num, total_num), replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]
        
    print('Extracting features of all audio files ...')
    extract_time = time.time()
    
    # Hdf5 file for storing features and targets. The context manager closes
    # the file even when processing a clip raises.
    with h5py.File(feature_path, 'w') as hf:

        hf.create_dataset(
            name='audio_name', 
            data=[audio_name.encode() for audio_name in meta_dict['audio_name']], 
            dtype='S80')

        if 'scene_label' in meta_dict.keys():
            hf.create_dataset(
                name='scene_label', 
                data=[scene_label.encode() for scene_label in meta_dict['scene_label']], 
                dtype='S24')
                
        if 'identifier' in meta_dict.keys():
            hf.create_dataset(
                name='identifier', 
                data=[identifier.encode() for identifier in meta_dict['identifier']], 
                dtype='S24')
                
        if 'source_label' in meta_dict.keys():
            hf.create_dataset(
                name='source_label', 
                data=[source_label.encode() for source_label in meta_dict['source_label']], 
                dtype='S8')

        # All feature datasets start empty and grow by one row per clip.
        hf.create_dataset(
            name='feature', 
            shape=(0, total_samples), 
            maxshape=(None, total_samples), 
            dtype=np.float32)
        hf.create_dataset(
            name='feature_gamm', 
            shape=(0, gamm_frames, n_gamm), 
            maxshape=(None, gamm_frames, n_gamm), 
            dtype=np.float32)
        hf.create_dataset(
            name='feature_mfcc', 
            shape=(0, mfcc_frames, n_mfcc), 
            maxshape=(None, mfcc_frames, n_mfcc), 
            dtype=np.float32)
        hf.create_dataset(
            name='feature_panns', 
            shape=(0, panns_samples), 
            maxshape=(None, panns_samples), 
            dtype=np.float32)
        
        for (n, audio_name) in enumerate(meta_dict['audio_name']):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)
            
            # Read audio
            (audio, _) = read_audio(
                audio_path=audio_path, 
                target_fs=sample_rate)
            
            audio = pad_truncate_sequence(audio, total_samples)
            
            # Gammatone feature, transposed before storing.
            (audio_gamm, _) = read_audio_gamm(
                audio_path=audio_path, 
                target_fs=sample_rate)
            fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate) 
            fea_gamm = fea_gamm.transpose(1, 0)

            # MFCC feature from a librosa load at its default sample rate,
            # transposed before storing.
            sound, fs = librosa.load(audio_path)
            fea_mfcc = librosa.feature.mfcc(
                y=sound, sr=fs, hop_length=mfcc_hop_size, n_mfcc=n_mfcc)
            fea_mfcc = fea_mfcc.transpose(1, 0)

            # Mono waveform at 32 kHz for the 'feature_panns' dataset.
            (waveform, _) = librosa.core.load(
                audio_path, sr=panns_sample_rate, mono=True)
            
            # NOTE(review): the log-mel feature is computed but never
            # written; the 'feature' dataset stores the waveform instead.
            # Confirm intent.
            feature = feature_extractor.transform(audio)
            feature = feature[0 : frames_num]
            
            hf['feature'].resize((n + 1, total_samples))
            hf['feature'][n] = audio        
            hf['feature_gamm'].resize((n + 1, gamm_frames, n_gamm))
            hf['feature_gamm'][n] = fea_gamm
            hf['feature_mfcc'].resize((n + 1, mfcc_frames, n_mfcc))
            hf['feature_mfcc'][n] = fea_mfcc
            hf['feature_panns'].resize((n + 1, panns_samples))
            hf['feature_panns'][n] = waveform
        
    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
Пример #8
0
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a single hdf5 
    file. 

    Each clip is padded or truncated to config.total_samples, converted with
    LogMelExtractor and stored as one row of 'feature'. When the metadata
    reports weak and/or strong labels, the matching 'weak_target' /
    'strong_target' rows are written alongside.

    Args:
      args: namespace with the attributes
        dataset_dir: string
        workspace: string
        data_type: string, passed to get_relative_path_no_extension;
            'validation' selects the csv under metadata/validation
        mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    classes_num = config.classes_num
    lb_to_idx = config.lb_to_idx

    # Paths
    prefix = 'minidata_' if mini_data else ''

    relative_name = get_relative_path_no_extension(data_type)
    audios_dir = os.path.join(dataset_dir, 'audio', relative_name)

    if data_type == 'validation':
        metadata_path = os.path.join(dataset_dir, 'metadata', 'validation',
                                     '{}.csv'.format(relative_name))
    else:
        metadata_path = os.path.join(dataset_dir, 'metadata',
                                     '{}.csv'.format(relative_name))

    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second,
                                             mel_bins),
        '{}.h5'.format(relative_name))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    (data_dict, has_weak_labels,
     has_strong_labels) = read_metadata(metadata_path)

    # Extract features and targets
    audio_names = sorted(data_dict.keys())

    if mini_data:
        # Seeded shuffle, then keep the first ten names.
        random_state = np.random.RandomState(1234)
        random_state.shuffle(audio_names)
        audio_names = audio_names[0:10]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets. The context manager closes
    # the file even when processing a clip raises.
    with h5py.File(feature_path, 'w') as hf:

        hf.create_dataset(
            name='audio_name',
            data=[audio_name.encode() for audio_name in audio_names],
            dtype='S64')

        # All per-clip datasets start empty and grow by one row per clip.
        hf.create_dataset(name='feature',
                          shape=(0, frames_num, mel_bins),
                          maxshape=(None, frames_num, mel_bins),
                          dtype=np.float32)

        # np.bool_ instead of the np.bool alias, which was removed from
        # NumPy.
        if has_weak_labels:
            hf.create_dataset(name='weak_target',
                              shape=(0, classes_num),
                              maxshape=(None, classes_num),
                              dtype=np.bool_)

        if has_strong_labels:
            hf.create_dataset(name='strong_target',
                              shape=(0, frames_num, classes_num),
                              maxshape=(None, frames_num, classes_num),
                              dtype=np.bool_)

        for (n, audio_name) in enumerate(audio_names):
            audio_path = os.path.join(audios_dir, audio_name)
            print(n, audio_path)

            # Read audio
            (audio, _) = read_audio(audio_path=audio_path,
                                    target_fs=sample_rate)

            # Pad or truncate audio recording
            audio = pad_truncate_sequence(audio, total_samples)

            # Extract feature
            feature = feature_extractor.transform(audio)

            # Remove the extra frames caused by padding zero
            feature = feature[0:frames_num]

            hf['feature'].resize((n + 1, frames_num, mel_bins))
            hf['feature'][n] = feature

            if has_weak_labels:
                weak_labels = data_dict[audio_name]['weak_labels']
                hf['weak_target'].resize((n + 1, classes_num))
                hf['weak_target'][n] = labels_to_target(
                    weak_labels, classes_num, lb_to_idx)

            if has_strong_labels:
                events = data_dict[audio_name]['strong_labels']
                hf['strong_target'].resize((n + 1, frames_num, classes_num))
                hf['strong_target'][n] = events_to_target(
                    events=events,
                    frames_num=frames_num,
                    classes_num=classes_num,
                    frames_per_second=frames_per_second,
                    lb_to_idx=lb_to_idx)

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
Пример #9
0
def pack_waveforms_to_hdf5(args):
    """Pack waveform and target of several audio clips to a single hdf5 file. 
    This can speed up loading and training.

    Clips listed in the csv but missing on disk are logged and skipped;
    their rows in the hdf5 datasets are never written.

    Args:
      args: namespace with the attributes
        audios_dir: str, directory containing the audio clips
        csv_path: str, metadata csv read with read_metadata
        waveforms_hdf5_path: str, output path ('.mini' is appended when
            mini_data is set)
        mini_data: bool, keep only the first 10 metadata rows
    """

    # Arguments & parameters
    audios_dir = args.audios_dir
    csv_path = args.csv_path
    waveforms_hdf5_path = args.waveforms_hdf5_path
    mini_data = args.mini_data

    clip_samples = config.clip_samples
    classes_num = config.classes_num
    sample_rate = config.sample_rate
    id_to_ix = config.id_to_ix

    # Paths
    if mini_data:
        prefix = 'mini_'
        waveforms_hdf5_path += '.mini'
    else:
        prefix = ''

    create_folder(os.path.dirname(waveforms_hdf5_path))

    logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(
        prefix, get_filename(csv_path))
    create_folder(logs_dir)
    create_logging(logs_dir, filemode='w')
    logging.info('Write logs to {}'.format(logs_dir))

    # Read csv file
    meta_dict = read_metadata(csv_path, classes_num, id_to_ix)

    if mini_data:
        mini_num = 10
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][0:mini_num]

    audios_num = len(meta_dict['audio_name'])

    # Pack waveform to hdf5
    total_time = time.time()

    with h5py.File(waveforms_hdf5_path, 'w') as hf:
        hf.create_dataset('audio_name', shape=(audios_num, ), dtype='S20')
        hf.create_dataset('waveform',
                          shape=(audios_num, clip_samples),
                          dtype=np.int16)
        # np.bool_ instead of the np.bool alias, which was removed from
        # NumPy.
        hf.create_dataset('target',
                          shape=(audios_num, classes_num),
                          dtype=np.bool_)
        hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)

        # Pack waveform & target of several audio clips to a single hdf5 file
        for n in range(audios_num):
            audio_name = meta_dict['audio_name'][n]
            audio_path = os.path.join(audios_dir, audio_name)

            if not os.path.isfile(audio_path):
                logging.info('{} File does not exist! {}'.format(
                    n, audio_path))
                continue

            logging.info('{} {}'.format(n, audio_path))
            (audio, _) = librosa.core.load(audio_path,
                                           sr=sample_rate,
                                           mono=True)
            audio = pad_or_truncate(audio, clip_samples)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)
            hf['target'][n] = meta_dict['target'][n]

    logging.info('Write to {}'.format(waveforms_hdf5_path))
    logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
Пример #10
0
def pack_maestro_dataset_to_hdf5(args):
    """Load & resample MAESTRO audio files, then write to hdf5 files.

    One hdf5 file is written per metadata row, under
    <workspace>/hdf5s/maestro, mirroring the relative path of the audio
    file. Each file holds the row's metadata as attributes plus the
    'midi_event', 'midi_event_time' and 'waveform' datasets.

    Args:
      args: namespace with the attributes
        dataset_dir: str, directory of dataset
        workspace: str, directory of your workspace
    """

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace

    sample_rate = config.sample_rate

    # Paths
    csv_path = os.path.join(dataset_dir, 'maestro-v2.0.0.csv')
    waveform_hdf5s_dir = os.path.join(workspace, 'hdf5s', 'maestro')

    logs_dir = os.path.join(workspace, 'logs', get_filename(__file__))
    create_logging(logs_dir, filemode='w')
    logging.info(args)

    # Read meta dict
    meta_dict = read_metadata(csv_path)

    audios_num = len(meta_dict['canonical_composer'])
    logging.info('Total audios number: {}'.format(audios_num))

    feature_time = time.time()

    # String attributes copied from the metadata, with their stored dtype.
    string_attrs = (('canonical_composer', 'S100'),
                    ('canonical_title', 'S100'),
                    ('split', 'S20'),
                    ('year', 'S10'),
                    ('midi_filename', 'S100'),
                    ('audio_filename', 'S100'))

    # Load & resample each audio file to a hdf5 file
    for n in range(audios_num):
        logging.info('{} {}'.format(n, meta_dict['midi_filename'][n]))

        # Read midi
        midi_path = os.path.join(dataset_dir, meta_dict['midi_filename'][n])
        midi_dict = read_midi(midi_path)

        # Load audio
        audio_path = os.path.join(dataset_dir, meta_dict['audio_filename'][n])
        (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)

        packed_hdf5_path = os.path.join(waveform_hdf5s_dir, '{}.h5'.format(
            os.path.splitext(meta_dict['audio_filename'][n])[0]))

        create_folder(os.path.dirname(packed_hdf5_path))

        with h5py.File(packed_hdf5_path, 'w') as hf:
            for (key, string_dtype) in string_attrs:
                hf.attrs.create(key,
                                data=meta_dict[key][n].encode(),
                                dtype=string_dtype)
            hf.attrs.create('duration',
                            data=meta_dict['duration'][n],
                            dtype=np.float32)

            hf.create_dataset(
                name='midi_event',
                data=[e.encode() for e in midi_dict['midi_event']],
                dtype='S100')
            hf.create_dataset(name='midi_event_time',
                              data=midi_dict['midi_event_time'],
                              dtype=np.float32)
            hf.create_dataset(name='waveform',
                              data=float32_to_int16(audio),
                              dtype=np.int16)

        # Logged per file: after the loop the path would be unbound for an
        # empty metadata table and would only name the last file otherwise.
        logging.info('Write hdf5 to {}'.format(packed_hdf5_path))

    logging.info('Time: {:.3f} s'.format(time.time() - feature_time))