def get_audiosets(cfg):
    """get audio sets"""

    # channel size
    channel_size = 1 if not cfg['feature_params']['use_channels'] else (
        int(cfg['feature_params']['use_cepstral_features'])
        + int(cfg['feature_params']['use_delta_features'])
        + int(cfg['feature_params']['use_double_delta_features']))

    # feature size
    feature_size = (
        (cfg['feature_params']['n_ceps_coeff'] + int(cfg['feature_params']['use_energy_features'])) * int(cfg['feature_params']['use_cepstral_features'])
        + (cfg['feature_params']['n_ceps_coeff'] + int(cfg['feature_params']['use_energy_features'])) * int(cfg['feature_params']['use_delta_features'])
        + (cfg['feature_params']['n_ceps_coeff'] + int(cfg['feature_params']['use_energy_features'])) * int(cfg['feature_params']['use_double_delta_features'])
        ) if not cfg['feature_params']['use_channels'] else (
        cfg['feature_params']['n_ceps_coeff'] + int(cfg['feature_params']['use_energy_features']))

    # exception: nothing to extract
    if feature_size == 0 or channel_size == 0:
        return None, None, None

    # audio sets
    audio_set1 = AudioDataset(cfg['datasets']['speech_commands'], cfg['feature_params'])
    audio_set2 = AudioDataset(cfg['datasets']['my_recordings'], cfg['feature_params'])

    # create speech commands dataset if not existing
    if not check_files_existance(audio_set1.feature_files):
        audio_set1 = SpeechCommandsDataset(cfg['datasets']['speech_commands'], feature_params=cfg['feature_params'], verbose=False)
        audio_set1.extract_features()

    # create my recordings dataset if not existing
    if not check_files_existance(audio_set2.feature_files):
        audio_set2 = MyRecordingsDataset(cfg['datasets']['my_recordings'], feature_params=cfg['feature_params'], verbose=False)
        audio_set2.extract_features()

    # select feature files (use both sets only if their label lists have the same length)
    all_feature_files = (audio_set1.feature_files + audio_set2.feature_files
                         if len(audio_set1.labels) == len(audio_set2.labels)
                         else audio_set1.feature_files)

    return audio_set1, audio_set2, all_feature_files
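# A minimal equivalent sketch of the channel/feature size arithmetic above, factored
# through a helper; the name get_feature_dims is hypothetical and not part of the
# original code, but it yields the same (channel_size, feature_size) pair.
def get_feature_dims(fp):
    # base feature length: cepstral coefficients plus optional energy feature
    base = fp['n_ceps_coeff'] + int(fp['use_energy_features'])
    # number of enabled feature groups (cepstral, delta, double delta)
    groups = (int(fp['use_cepstral_features'])
              + int(fp['use_delta_features'])
              + int(fp['use_double_delta_features']))
    if fp['use_channels']:
        # one channel per enabled group, each of base length
        return groups, base
    # single channel with all enabled groups stacked along the feature axis
    return 1, base * groups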
def test_scalogram_preprocessing(self):
    cqt_default_dict = {'sample_rate': 44100,
                        'fmin': 30,
                        'n_bins': 292,
                        'bins_per_octave': 32,
                        'filter_scale': 0.5,
                        'hop_length': 256,
                        'trainable_cqt': False}

    dataset_path = '/Volumes/Elements/Datasets/MelodicProgressiveHouse_mp3'
    dataset = AudioDataset(location=dataset_path, item_length=44100 * 5)

    # audio_clip = torch.rand([1, 1, 44100*5]) * 1. - 0.5
    audio_clip = dataset[20000][0, :].view(1, 1, -1)

    prep_module = PreprocessingModule(cqt_default_dict,
                                      phase=False,
                                      offset_zero=True,
                                      output_power=2,
                                      pooling=[1, 2])
    x = prep_module(audio_clip)

    # inspect the value range of the scalogram
    x_min = x.min()
    x_max = x.max()

    plt.imshow(x[0, 0], origin='lower')
    plt.show()
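# Sanity check on the CQT configuration above: 292 bins at 32 bins per octave span
# 292 / 32 = 9.125 octaves, so the top bin sits near fmin * 2**9.125 ≈ 30 * 558 ≈ 16.7 kHz,
# safely below the 22.05 kHz Nyquist frequency of the 44.1 kHz sample rate.
fmin, n_bins, bins_per_octave, sample_rate = 30, 292, 32, 44100
fmax = fmin * 2 ** (n_bins / bins_per_octave)
assert fmax < sample_rate / 2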
def get_dataloaders(feat_folder, class_labels, folds, batch_size):
    data_set = AudioDataset(feat_folder, class_labels, folds)
    train_dl = DataLoader(data_set, batch_size=batch_size, shuffle=True, num_workers=0)
    return train_dl
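# A minimal usage sketch for get_dataloaders above; the folder, labels, folds and
# batch size are hypothetical, and the batch shapes depend on what this project's
# AudioDataset.__getitem__ actually returns.
class_labels = ['class_a', 'class_b', 'class_c']
train_dl = get_dataloaders('feat/', class_labels, folds=[1, 2, 3], batch_size=16)
for batch_x, batch_y in train_dl:
    break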
def audio_set_wavs(cfg):
    """audio set wavs"""

    # plot path
    plot_path = '../docu/thesis/5_exp/figs/'

    # audio sets
    a1 = AudioDataset(cfg['datasets']['speech_commands'], cfg['feature_params'], root_path='../')
    a2 = AudioDataset(cfg['datasets']['my_recordings'], cfg['feature_params'], root_path='../')

    # feature extractor
    feature_extractor = FeatureExtractor(cfg['feature_params'])

    # get audio files
    a1.get_audiofiles()

    # random seed
    np.random.seed(1234)
    r = np.random.randint(low=0, high=150, size=len(a1.set_audio_files[1]))

    wav_grid = []

    # process wavs
    for wav in sorted([label_wavs[r[i]] for i, label_wavs in enumerate(a1.set_audio_files[1])]):

        # info
        print("wav: ", wav)

        # get raw
        x, _ = a1.wav_pre_processing(wav)

        # extract feature vectors [m x l]
        _, bon_pos = feature_extractor.extract_mfcc(x, reduce_to_best_onset=False)

        # append to wav grid
        wav_grid.append((librosa.util.normalize(x), re.sub(r'[0-9]+-', '', wav.split('/')[-1].split('.')[0]), bon_pos))

    # plot wav grid
    plot_wav_grid(wav_grid, feature_params=a1.feature_params, grid_size=(6, 5), plot_path=plot_path, name='wav_grid_c30', show_plot=True)
# Result save path
asset_path = config.path['asset_path']
ckpt_path = config.path['ckpt_path']
result_path = config.path['result_path']
restore_epoch = args.restore_epoch
experiment_num = str(args.index)
ckpt_file_name = 'idx_' + experiment_num + '_%03d.pth.tar'
tf_logger = TF_Logger(os.path.join(asset_path, 'tensorboard', 'idx_' + experiment_num))
logger.info("==== Experiment Number : %d " % args.index)

if args.pre_model == 'cnn':
    config.experiment['batch_size'] = 20

# Data loader
train_dataset1 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset1,), num_workers=20, preprocessing=False, train=True, kfold=args.kfold)
train_dataset2 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset2,), num_workers=20, preprocessing=False, train=True, kfold=args.kfold)
train_dataset3 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset3,), num_workers=20, preprocessing=False, train=True, kfold=args.kfold)
train_dataset = train_dataset1 + train_dataset2 + train_dataset3

valid_dataset1 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset1,), preprocessing=False, train=False, kfold=args.kfold)
valid_dataset2 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset2,), preprocessing=False, train=False, kfold=args.kfold)
valid_dataset3 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset3,), preprocessing=False, train=False, kfold=args.kfold)
valid_dataset = valid_dataset1 + valid_dataset2 + valid_dataset3

train_dataloader = AudioDataLoader(dataset=train_dataset, batch_size=config.experiment['batch_size'], drop_last=False, shuffle=True)
valid_dataloader = AudioDataLoader(dataset=valid_dataset, batch_size=config.experiment['batch_size'], drop_last=False)

# Model and Optimizer
if args.pre_model == 'cnn':
    pre_model = CNN(config=config.model).to(device)
elif args.pre_model == 'crnn':
    pre_model = CRNN(config=config.model).to(device)
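# Concatenating torch datasets with "+" (as above) produces a torch.utils.data.ConcatDataset
# whose length is the sum of the parts; a minimal self-contained sketch with TensorDataset
# stand-ins for the AudioDatasets:
import torch
from torch.utils.data import TensorDataset

d1 = TensorDataset(torch.zeros(3, 2))
d2 = TensorDataset(torch.ones(5, 2))
combined = d1 + d2
assert len(combined) == len(d1) + len(d2)  # 8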
import yaml

from batch_archive import SpeechCommandsBatchArchive
from audio_dataset import AudioDataset
from plots import plot_grid_images, plot_other_grid

# yaml config file
cfg = yaml.safe_load(open("./config.yaml"))

# change config upon nn arch: wavenet does not use mfcc features
cfg['feature_params']['use_mfcc_features'] = cfg['ml']['nn_arch'] != 'wavenet'

# audio sets
audio_set1 = AudioDataset(cfg['datasets']['speech_commands'], cfg['feature_params'])
audio_set2 = AudioDataset(cfg['datasets']['my_recordings'], cfg['feature_params'])

# create batches
batch_archive = SpeechCommandsBatchArchive(audio_set1.feature_files + audio_set2.feature_files, batch_size=32, batch_size_eval=5)

# reduce to label and add noise
# batch_archive.reduce_to_label('up')
# batch_archive.add_noise_data(shuffle=True)

print("data size: ", batch_archive.data_size)
print("classes: ", batch_archive.class_dict)
transformation = utils.JointCompose([
    utils.JointHorizontalFlip(),
    utils.JointVerticalFlip(),
    # utils.JointNormailze(means=[0.485, 0.456, 0.406], stds=[1, 1, 1]),  # TODO: consider using
    utils.JointToTensor(),
])

val_transformation = utils.JointCompose([
    # utils.JointNormailze(means=[0.485, 0.456, 0.406], stds=[1, 1, 1]),
    utils.JointToTensor(),
])

VAL_PART = args.val_part

trainset = AudioDataset(data_h5_path='preprocess_audio/data.h5', add_rpm=False, train=True)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

# trainset = AudioGenDataset("/home/simon/denoise/dataset/mini_dataset/", dataset_size=30, add_rpm=False)
# train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

statistic_loader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=args.num_workers)

# valset = AudioDataset(data_dir=args.datapath, train=False, validation_part=VAL_PART, validation=True)
checkpoint_file = "/home/tomhirshberg/project/Denoising-drone-rotors/output//2019-05-24_15-27-48/checkpoint.pth.tar"  # checkpoint location

if os.path.isfile(checkpoint_file):
    print("loading checkpoint {}".format(checkpoint_file))
    checkpoint = torch.load(checkpoint_file, map_location=device)
    load_model(model, checkpoint)
else:
    print("can't load checkpoint file")
    exit()

# Load dataset
datapath = '/home/tomhirshberg/project/Denoising-drone-rotors/preprocess_audio/data.h5'

# testset = MSRDemosaic(root=datapath, train=False, transform=val_transformation)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=num_workers)

# use train set to check overfitting
testset = AudioDataset(data_h5_path=datapath, add_rpm=False, train=False)
test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=True, num_workers=num_workers)

# testset = AudioDataset(data_h5_path=datapath, train=False)
# test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=num_workers)

import IPython.display as ipd
from preprocess_audio.preprocess_audio import *
# convert back to audio
from preprocess_audio.postprocess_audio import *

# ipd.Audio('/home/simon/denoise/dataset/audio/file_example_WAV_1MG.wav')
# ipd.Audio(x, rate=sr)  # load a NumPy array
parser.add_argument('--save_prefix', help='Full path and prefix for saving output models')
parser.add_argument('--use_autoencoder', action='store_true')
args = parser.parse_args()

if args.epochs is None:
    args.epochs = 5

arch = [(i, j) for i, j in zip(args.arch[:-1], args.arch[1:])]

with open(args.fold_config) as f:
    config = cPickle.load(f)

preproc_layer = PreprocLayer(config=config, proc_type='standardize')
dataset = TransformerDataset(
    raw=AudioDataset(which_set='train', config=config),
    transformer=preproc_layer.layer_content)

# transformer_yaml = '''!obj:pylearn2.datasets.transformer_dataset.TransformerDataset {
#     raw : %(raw)s,
#     transformer : %(transformer)s
# }'''
#
# dataset_yaml = transformer_yaml % {
#     'raw' : '''!obj:audio_dataset.AudioDataset {
#         which_set : 'train',
#         config : !pkl: "%(fold_config)s"
#     }''' % {'fold_config' : args.fold_config},
#     'transformer' : '''!obj:pylearn2.models.mlp.MLP {
#         nvis : %(nvis)i,
#         layers :
        return self.weights

    def get_param_values(self):
        return list((self.get_weights(), self.get_biases()))


if __name__ == '__main__':
    # tests
    import theano
    import cPickle
    from audio_dataset import AudioDataset

    with open('GTZAN_stratified.pkl') as f:
        config = cPickle.load(f)

    D = AudioDataset(config)

    feat_space = VectorSpace(dim=D.X.shape[1])
    feat_space_complex = VectorSpace(dim=D.X.shape[1], dtype='complex64')
    target_space = VectorSpace(dim=len(D.label_list))

    data_specs_frame = (CompositeSpace((feat_space, target_space)), ("features", "targets"))
    data_specs_song = (CompositeSpace((feat_space_complex, target_space)), ("songlevel-features", "targets"))

    framelevel_it = D.iterator(mode='sequential', batch_size=10, data_specs=data_specs_frame)
    frame_batch = framelevel_it.next()

    songlevel_it = D.iterator(mode='sequential', batch_size=1, data_specs=data_specs_song)
    song_batch = songlevel_it.next()
def main():
    config_filename = Path.cwd().joinpath(CONFIGS_DIR).joinpath(CONFIG_FILENAME)
    config = Configuration(config_filename)

    batch_size = 4
    epochs = 1

    results_dir_path = Path.cwd().joinpath(RESULTS_DIR)
    current_run_path = create_results_directories(results_dir_path)

    transforms = TransformsComposer([Rescale(output_size=10000), ToTensor()])

    encoder = LabelEncoder()
    data_loader = DataLoader(config)
    x_train, y_train = data_loader.get_train_set()
    encoder.fit(y_train)

    classes = encoder.classes_
    classes_map = {}
    for i, category in enumerate(classes):
        classes_map[i] = category
    print(classes_map)

    y_train = encoder.transform(y_train)
    train_dataset = AudioDataset(x_train, y_train, transforms)

    x_test, y_test = data_loader.get_test_set()
    y_test = encoder.transform(y_test)
    test_dataset = AudioDataset(x_test, y_test, transforms)

    model = M5(num_classes=len(classes_map))

    states_dir = Path.cwd().joinpath(STATES_DIR)
    state_filename = f'{uuid.uuid1()}_state_{epochs}_epochs.pth'
    state_path = current_run_path.joinpath('best_snapshot').joinpath(state_filename)

    classifier = Classifier(model=model, state_path=state_path)

    # Fit model on data
    train_loss_history, val_loss_history = classifier.fit(
        train_dataset,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=test_dataset)

    # plt.figure()
    # plt.title(f'Model Loss for {epochs} epochs')
    # plt.xlabel('epoch')
    # plt.ylabel('loss')
    # plt.plot(train_loss_history, label='train')
    # plt.plot(val_loss_history, label='test')
    # plt.legend()
    # plt.show()

    predictions_path = current_run_path.joinpath('./predicted.csv')
    validation_dataset = AudioDataset(x_test, y_test, transforms)
    validation_model = M5(num_classes=len(classes_map))
    validation_classifier = Classifier(validation_model, state_path=state_path)
    validation_classifier.predict(validation_dataset,
                                  batch_size=batch_size,
                                  output_filepath=predictions_path,
                                  classes=classes_map)
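# A minimal sketch of the label-encoding round trip used above, assuming encoder is
# sklearn.preprocessing.LabelEncoder; the sample category names are hypothetical.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_raw = ['dog_bark', 'siren', 'dog_bark', 'drilling']  # hypothetical labels
y_encoded = encoder.fit_transform(y_raw)               # array([0, 2, 0, 1])

# classes_map maps each integer id back to its category name, mirroring what
# encoder.inverse_transform would do for predictions
classes_map = {i: category for i, category in enumerate(encoder.classes_)}
# classes_map == {0: 'dog_bark', 1: 'drilling', 2: 'siren'}
# encoder.inverse_transform(y_encoded) -> ['dog_bark' 'siren' 'dog_bark' 'drilling']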
    now_tuple[0] % 100, now_tuple[1], now_tuple[2], now_tuple[3], now_tuple[4])
ckpt_file_name = date_info + '_%03d.pth.tar'
# tf_logger = TF_Logger(os.path.join(asset_path, 'tensorboard', 'idx_'+experiment_num))

subdir = 'from_json' if args.from_json else 'from_wav'
writer = SummaryWriter(log_dir=os.path.join(asset_path, 'tensorboard', subdir, date_info))
logger.info("==== Experiment Number : %d " % args.index)

if args.model == 'cnn':
    config.experiment['batch_size'] = 10

# Data loader
train_dataset1 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset1,), num_workers=20, from_json=args.from_json, preprocessing=False, train=True, kfold=args.kfold)
# train_dataset2 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset2,), num_workers=20, preprocessing=False, train=True, kfold=args.kfold)
# train_dataset3 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset3,), num_workers=20, preprocessing=False, train=True, kfold=args.kfold)
# train_dataset = train_dataset1.__add__(train_dataset2).__add__(train_dataset3)

valid_dataset1 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset1,), from_json=args.from_json, preprocessing=False, train=False, kfold=args.kfold)
# valid_dataset2 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset2,), preprocessing=False, train=False, kfold=args.kfold)
# valid_dataset3 = AudioDataset(config, root_dir=config.path['root_path'], dataset_names=(args.dataset3,), preprocessing=False, train=False, kfold=args.kfold)
    transform_y = (
        lambda y: np.sqrt(y[:y.shape[0] / 2, :]**2 + y[y.shape[0] / 2:, :]**2))
    mask_value = -1.
else:
    transform_y = (lambda y: y)
    mask_value = 0.

# load the data ####################
maxlen = None
maxlen = 500

print "Loading data..."

# development data
D_valid = AudioDataset(config['taskfile_x_valid'],
                       config['taskfile_y_valid'],
                       datafile=config['datafile_valid'],
                       params_stft=config['params_stft'])
# print "  Loading validation data..."
# x_valid, y_valid, mask_valid = D_valid.get_padded_data_matrix(transform_x=transform_x, transform_y=transform_y, pad_value=mask_value, maxlen=maxlen)

for i in range(10):
    x = util.wavread(D_valid.x_wavfiles[i])[0:1, :]
    xr = D_valid.reconstruct_x(i)[0:1, :]
    if xr.shape[1] > x.shape[1]:
        xr = xr[:, :x.shape[1]]
    print "For file %d, NMSE between original x and reconstructed x is %e" % (i, np.mean((x - xr)**2) / np.mean(x**2))

    y = util.wavread(D_valid.y_wavfiles[i])[0:1, :]
    yr = D_valid.reconstruct_y(i)
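# The per-file error printed above is a normalized mean squared error; a minimal NumPy
# sketch of the same quantity (the function name nmse is ours, not from the original
# code), truncating the reconstruction to the reference length as done above.
import numpy as np

def nmse(x, x_rec):
    x = np.asarray(x, dtype=float)
    x_rec = np.asarray(x_rec, dtype=float)
    n = min(x.shape[-1], x_rec.shape[-1])
    # mean squared error normalized by the signal power of the reference
    return np.mean((x[..., :n] - x_rec[..., :n]) ** 2) / np.mean(x[..., :n] ** 2)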
parser.add_argument('--which_set', help='train, test, or valid')
parser.add_argument('--save_file', help='Save results to tab separated file')
args = parser.parse_args()

# get model
model = serial.load(args.model_file)

if args.which_set is None:
    args.which_set = 'test'

if args.testset:
    # dataset config passed in from command line
    print 'Using dataset passed in from command line'
    with open(args.testset) as f:
        config = cPickle.load(f)
    dataset = AudioDataset(config=config, which_set=args.which_set)

    # get model dataset for its labels...
    model_dataset = yaml_parse.load(model.dataset_yaml_src)
    label_list = model_dataset.label_list
else:
    # get dataset from model's yaml_src
    print "Using dataset from model's yaml src"
    p = re.compile(r"which_set.*'(train)'")
    dataset_yaml = p.sub("which_set: '{}'".format(args.which_set), model.dataset_yaml_src)
    dataset = yaml_parse.load(dataset_yaml)
    label_list = dataset.label_list

# measure test error
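# A minimal check of the which_set substitution above; the YAML fragment is a
# hypothetical stand-in for model.dataset_yaml_src.
import re

yaml_src = "dataset: !obj:audio_dataset.AudioDataset { which_set: 'train' }"
p = re.compile(r"which_set.*'(train)'")
print(p.sub("which_set: 'test'", yaml_src))
# dataset: !obj:audio_dataset.AudioDataset { which_set: 'test' }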