def evaluate(model, batch, device):
    """Compute losses plus frame-level and note-level metrics for one batch.

    Args:
        model: Callable mapping ``batch['audio']`` to a
            ``(frame_logit, onset_logit)`` pair.
        batch: Mapping with ``'audio'``, ``'frame'`` and ``'onset'`` entries.
            It is passed through ``allocate_batch`` before use.
        device: Device forwarded to ``allocate_batch``.

    Returns:
        A ``defaultdict(list)`` keyed by ``'metric/<category>/<name>'``.
        The two loss keys receive one value per call; every other key
        receives one value per item along the first axis of
        ``batch['audio']``.
    """
    metrics = defaultdict(list)
    batch = allocate_batch(batch, device)

    frame_logit, onset_logit = model(batch['audio'])

    criterion = nn.BCEWithLogitsLoss()
    frame_loss = criterion(frame_logit, batch['frame'])
    # The onset loss must be computed from the onset head. The previous code
    # passed frame_logit here, which reported a meaningless onset loss.
    onset_loss = criterion(onset_logit, batch['onset'])
    metrics['metric/loss/frame_loss'].append(frame_loss.cpu().numpy())
    metrics['metric/loss/onset_loss'].append(onset_loss.cpu().numpy())

    # Converts frame indices to seconds (assumes extract_notes returns
    # intervals in frame units -- TODO confirm against extract_notes).
    scaling = HOP_SIZE / SAMPLE_RATE

    for n in range(batch['audio'].shape[0]):
        frame_pred = th.sigmoid(frame_logit[n])
        onset_pred = th.sigmoid(onset_logit[n])

        # Frame-wise precision / recall / F1 for both output heads.
        pr, rc, f1 = framewise_eval(frame_pred, batch['frame'][n])
        metrics['metric/frame/frame_precision'].append(pr)
        metrics['metric/frame/frame_recall'].append(rc)
        metrics['metric/frame/frame_f1'].append(f1)

        pr, rc, f1 = framewise_eval(onset_pred, batch['onset'][n])
        metrics['metric/frame/onset_precision'].append(pr)
        metrics['metric/frame/onset_recall'].append(rc)
        metrics['metric/frame/onset_f1'].append(f1)

        # Decode notes from the predictions and from the reference labels.
        p_est, i_est = extract_notes(onset_pred, frame_pred)
        p_ref, i_ref = extract_notes(batch['onset'][n], batch['frame'][n])

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

        # offset_ratio=None: score notes without taking offsets into account.
        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        # Default offset_ratio: offsets are part of the note match.
        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

    return metrics
def train(model_type, logdir, batch_size, iterations, validation_interval,
          sequence_length, learning_rate, weight_decay, cnn_unit, fc_unit,
          debug=False, save_midi=False):
    """Train a transcription model, validate periodically, then test it.

    Args:
        model_type: One of ``'baseline'``, ``'rnn'``, ``'crnn'``, ``'ONF'``.
        logdir: Output directory for the checkpoint and ``results.txt``.
            When ``None`` a timestamped directory under ``runs/`` is used.
        batch_size: Batch size for the training and validation loaders
            (the test loader always uses a batch size of 1).
        iterations: Number of optimisation steps (forced to 100 in debug).
        validation_interval: Validate every this many steps (forced to 10
            in debug).
        sequence_length: Excerpt length passed to the datasets; rounded
            down to a multiple of ``HOP_SIZE``.
        learning_rate: Learning rate for Adam.
        weight_decay: Weight decay for Adam.
        cnn_unit: Forwarded to the model constructor; stored in the
            checkpoint.
        fc_unit: Forwarded to the model constructor; stored in the
            checkpoint.
        debug: If true, train and validate on the ``'debug'`` group.
        save_midi: Forwarded to ``evaluate`` as ``save`` for the test set.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    # Fail fast, before creating directories or loading data. Previously an
    # unknown name left `model` unbound and crashed later with a NameError.
    supported_models = ('baseline', 'rnn', 'crnn', 'ONF')
    if model_type not in supported_models:
        raise ValueError(
            f'unknown model_type {model_type!r}; '
            f'expected one of {supported_models}')

    if logdir is None:
        logdir = Path('runs') / (
            'exp_' + datetime.now().strftime('%y%m%d-%H%M%S'))
    Path(logdir).mkdir(parents=True, exist_ok=True)

    # Round the excerpt length down to a whole number of hops.
    if sequence_length % HOP_SIZE != 0:
        adj_length = sequence_length // HOP_SIZE * HOP_SIZE
        print(
            f'sequence_length: {sequence_length} is not divide by {HOP_SIZE}.\n \
            adjusted into : {adj_length}')
        sequence_length = adj_length

    if debug:
        dataset = MAESTRO_small(groups=['debug'],
                                sequence_length=sequence_length,
                                hop_size=HOP_SIZE,
                                random_sample=True)
        valid_dataset = dataset
        iterations = 100
        validation_interval = 10
    else:
        dataset = MAESTRO_small(groups=['train'],
                                sequence_length=sequence_length,
                                hop_size=HOP_SIZE,
                                random_sample=True)
        valid_dataset = MAESTRO_small(groups=['validation'],
                                      sequence_length=sequence_length,
                                      hop_size=HOP_SIZE,
                                      random_sample=False)
    train_loader = DataLoader(dataset, batch_size, shuffle=True)

    device = th.device('cuda') if th.cuda.is_available() else th.device('cpu')

    if model_type == 'baseline':
        model = Transcriber(cnn_unit=cnn_unit, fc_unit=fc_unit)
    elif model_type == 'rnn':
        model = Transcriber_RNN(cnn_unit=cnn_unit, fc_unit=fc_unit)
    elif model_type == 'crnn':
        model = Transcriber_CRNN(cnn_unit=cnn_unit, fc_unit=fc_unit)
    else:  # 'ONF' (validated above)
        model = Transcriber_ONF(cnn_unit=cnn_unit, fc_unit=fc_unit)

    optimizer = th.optim.Adam(model.parameters(), learning_rate,
                              weight_decay=weight_decay)
    scheduler = StepLR(optimizer, step_size=1000, gamma=0.98)
    criterion = nn.BCEWithLogitsLoss()

    model = model.to(device)

    # Defined up front so the checkpoint below can be written even when the
    # loop body never runs (e.g. iterations == 0).
    step = 0
    loop = tqdm(range(1, iterations + 1))

    # zip() stops once `loop` is exhausted; cycle() restarts the loader so
    # the step count is independent of the dataset size.
    for step, batch in zip(loop, cycle(train_loader)):
        optimizer.zero_grad()
        batch = allocate_batch(batch, device)

        frame_logit, onset_logit = model(batch['audio'])
        frame_loss = criterion(frame_logit, batch['frame'])
        onset_loss = criterion(onset_logit, batch['onset'])
        loss = onset_loss + frame_loss

        loss.mean().backward()

        # Each parameter tensor is clipped on its own (norm <= 3.0), not by
        # the global gradient norm across all parameters.
        for parameter in model.parameters():
            clip_grad_norm_([parameter], 3.0)

        optimizer.step()
        scheduler.step()
        loop.set_postfix_str("loss: {:.3e}".format(loss.mean()))

        if step % validation_interval == 0:
            model.eval()
            with th.no_grad():
                valid_loader = DataLoader(valid_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)
                metrics = defaultdict(list)
                for valid_batch in valid_loader:
                    batch_results = evaluate(model, valid_batch, device)
                    for key, value in batch_results.items():
                        metrics[key].extend(value)
            print('')
            for key, value in metrics.items():
                if key[-2:] == 'f1' or 'loss' in key:
                    print(f'{key:27} : {np.mean(value):.4f}')
            model.train()

    th.save(
        {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'step': step,
            'cnn_unit': cnn_unit,
            'fc_unit': fc_unit
        },
        Path(logdir) / f'model-{step}.pt')

    # Drop the training data before loading the test set.
    del dataset, valid_dataset

    test_dataset = MAESTRO_small(groups=['test'],
                                 hop_size=HOP_SIZE,
                                 random_sample=False)
    model.eval()
    with th.no_grad():
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
        metrics = defaultdict(list)
        for test_batch in test_loader:
            batch_results = evaluate(model, test_batch, device,
                                     save=save_midi, save_path=logdir)
            for key, value in batch_results.items():
                metrics[key].extend(value)
    print('')
    for key, value in metrics.items():
        if key[-2:] == 'f1' or 'loss' in key:
            print(f'{key} : {np.mean(value)}')

    with open(Path(logdir) / 'results.txt', 'w') as f:
        for key, values in metrics.items():
            _, category, name = key.split('/')
            metric_string = f'{category:>32} {name:26}: {np.mean(values):.3f} +- {np.std(values):.3f}'
            print(metric_string)
            f.write(metric_string + '\n')
def evaluate(model, batch, device, save=False, save_path=None):
    """Compute losses and transcription metrics for one batch.

    Optionally writes the estimated notes of every item as a MIDI file and
    a synthesized WAV file.

    Args:
        model: Callable mapping ``batch['audio']`` to a
            ``(frame_logit, onset_logit)`` pair.
        batch: Mapping with ``'audio'``, ``'frame'`` and ``'onset'`` entries
            (and ``'path'`` when ``save`` is true). It is passed through
            ``allocate_batch`` before use.
        device: Device forwarded to ``allocate_batch``.
        save: When true, write ``<stem>.midi`` and ``<stem>.wav`` for each
            item, where ``<stem>`` comes from ``batch['path'][n]``.
        save_path: Directory for the written files; required when ``save``
            is true.

    Returns:
        A ``defaultdict(list)`` keyed by ``'metric/<category>/<name>'``.
        The two loss keys receive one value per call; every other key
        receives one value per item along the first axis of
        ``batch['audio']``.
    """
    metrics = defaultdict(list)
    batch = allocate_batch(batch, device)

    frame_logit, onset_logit = model(batch['audio'])

    criterion = nn.BCEWithLogitsLoss()
    frame_loss = criterion(frame_logit, batch['frame'])
    # The onset loss must be computed from the onset head. The previous code
    # passed frame_logit here, which reported a meaningless onset loss.
    onset_loss = criterion(onset_logit, batch['onset'])
    metrics['metric/loss/frame_loss'].append(frame_loss.cpu().numpy())
    metrics['metric/loss/onset_loss'].append(onset_loss.cpu().numpy())

    # Converts frame indices to seconds (assumes extract_notes returns
    # intervals in frame units -- TODO confirm against extract_notes).
    scaling = HOP_SIZE / SAMPLE_RATE

    for n in range(batch['audio'].shape[0]):
        frame_pred = th.sigmoid(frame_logit[n])
        onset_pred = th.sigmoid(onset_logit[n])

        # Frame-wise precision / recall / F1 for both output heads.
        pr, rc, f1 = framewise_eval(frame_pred, batch['frame'][n])
        metrics['metric/frame/frame_precision'].append(pr)
        metrics['metric/frame/frame_recall'].append(rc)
        metrics['metric/frame/frame_f1'].append(f1)

        pr, rc, f1 = framewise_eval(onset_pred, batch['onset'][n])
        metrics['metric/frame/onset_precision'].append(pr)
        metrics['metric/frame/onset_recall'].append(rc)
        metrics['metric/frame/onset_f1'].append(f1)

        # Decode notes from the predictions and from the reference labels.
        p_est, i_est = extract_notes(onset_pred, frame_pred)
        p_ref, i_ref = extract_notes(batch['onset'][n], batch['frame'][n])

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

        # offset_ratio=None: score notes without taking offsets into account.
        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        # Default offset_ratio: offsets are part of the note match.
        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        if save:
            stem = Path(batch['path'][n]).stem
            if len(p_est) == 0:
                # Nothing to render. Actually skip, as the message says,
                # instead of writing an empty MIDI and synthesizing it.
                print(f'no onset detected. skip: {stem}')
                continue

            midi_filename = Path(save_path) / (stem + '.midi')
            # Every note is written with a fixed velocity of 64.
            save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

            wav_filename = Path(save_path) / (stem + '.wav')
            midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
            synth_audio = midi_file.fluidsynth(fs=16000)
            soundfile.write(wav_filename, synth_audio, 16000)

    return metrics
def train(model_type, logdir, batch_size, iterations, validation_interval,
          sequence_length, learning_rate, weight_decay, cnn_unit, fc_unit,
          debug=False, save_midi=False, n_train=1):
    """Train a transcription model, validate periodically, then test it.

    Training can be stopped early with Ctrl+C; the checkpoint is still
    written and the test evaluation still runs.

    Args:
        model_type: One of ``'baseline'``, ``'rnn'``, ``'crnn'``, ``'ONF'``,
            ``'udrnn'``.
        logdir: Output directory for the checkpoint and ``results.txt``.
            When ``None`` a timestamped directory under ``runs/`` (suffixed
            with ``model_type``) is used.
        batch_size: Batch size for the training and validation loaders
            (the test loader always uses a batch size of 1).
        iterations: Number of optimisation steps (forced to 100 in debug).
        validation_interval: Validate every this many steps (forced to 10
            in debug).
        sequence_length: Excerpt length passed to the datasets; rounded
            down to a multiple of ``HOP_SIZE``.
        learning_rate: Learning rate for Adam.
        weight_decay: Weight decay for Adam.
        cnn_unit: Forwarded to the model constructor; stored in the
            checkpoint.
        fc_unit: Forwarded to the model constructor; stored in the
            checkpoint.
        debug: If true, train and validate on the ``'debug'`` group.
        save_midi: Forwarded to ``evaluate`` as ``save`` for the test set.
        n_train: Not used by this function; kept for caller compatibility.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    # Fail fast, before creating directories or loading data. Previously an
    # unknown name left `model` unbound and crashed later with a NameError.
    supported_models = ('baseline', 'rnn', 'crnn', 'ONF', 'udrnn')
    if model_type not in supported_models:
        raise ValueError(
            f'unknown model_type {model_type!r}; '
            f'expected one of {supported_models}')

    # Set the log directory.
    if logdir is None:
        logdir = Path('runs') / (
            'exp_' + datetime.now().strftime('%y%m%d-%H%M%S') + '_' +
            model_type)
    Path(logdir).mkdir(parents=True, exist_ok=True)
    results_path = Path(logdir) / 'results.txt'

    # Round the excerpt length down to a whole number of hops.
    if sequence_length % HOP_SIZE != 0:
        adj_length = sequence_length // HOP_SIZE * HOP_SIZE
        print(
            f'sequence_length: {sequence_length} is not divide by {HOP_SIZE}.\n \
            adjusted into : {adj_length}')
        sequence_length = adj_length

    # Dataset setting.
    if debug:
        dataset = MAESTRO_small(groups=['debug'],
                                sequence_length=sequence_length,
                                hop_size=HOP_SIZE,
                                random_sample=True)
        valid_dataset = dataset
        iterations = 100
        validation_interval = 10
    else:
        dataset = MAESTRO_small(groups=['train'],
                                sequence_length=sequence_length,
                                hop_size=HOP_SIZE,
                                random_sample=True)
        valid_dataset = MAESTRO_small(groups=['validation'],
                                      sequence_length=sequence_length,
                                      hop_size=HOP_SIZE,
                                      random_sample=False)
    train_loader = DataLoader(dataset, batch_size, shuffle=True)

    # Device setting.
    device = th.device('cuda') if th.cuda.is_available() else th.device('cpu')
    # current_device() raises when no CUDA device is usable, so only report
    # GPU information when CUDA is actually available.
    if th.cuda.is_available():
        print(th.cuda.device_count(), th.cuda.current_device())

    # Model setting.
    if model_type == 'baseline':
        model = Transcriber(cnn_unit=cnn_unit, fc_unit=fc_unit)
    elif model_type == 'rnn':
        model = Transcriber_RNN(cnn_unit=cnn_unit, fc_unit=fc_unit)
    elif model_type == 'crnn':
        model = Transcriber_CRNN(cnn_unit=cnn_unit, fc_unit=fc_unit)
    elif model_type == 'ONF':
        model = Transcriber_ONF(cnn_unit=cnn_unit, fc_unit=fc_unit)
    else:  # 'udrnn' (validated above)
        model = Transcriber_udRNN(cnn_unit=cnn_unit, fc_unit=fc_unit)

    optimizer = th.optim.Adam(model.parameters(), learning_rate,
                              weight_decay=weight_decay)
    scheduler = StepLR(optimizer, step_size=1000, gamma=0.98)
    criterion = nn.BCEWithLogitsLoss()

    model = model.to(device)

    # Defined up front so the checkpoint below can be written even when
    # training is interrupted before the first step completes.
    step = 0
    loop = tqdm(range(1, iterations + 1))
    try:
        # zip() stops once `loop` is exhausted; cycle() restarts the loader
        # so the step count is independent of the dataset size.
        for step, batch in zip(loop, cycle(train_loader)):
            optimizer.zero_grad()
            batch = allocate_batch(batch, device)

            # The model maps audio to frame and onset logits.
            frame_logit, onset_logit = model(batch['audio'])
            frame_loss = criterion(frame_logit, batch['frame'])
            onset_loss = criterion(onset_logit, batch['onset'])
            loss = onset_loss + frame_loss

            loss.mean().backward()

            # Each parameter tensor is clipped on its own (norm <= 3.0),
            # not by the global gradient norm across all parameters.
            for parameter in model.parameters():
                clip_grad_norm_([parameter], 3.0)

            optimizer.step()
            scheduler.step()
            loop.set_postfix_str("loss: {:.3e}".format(loss.mean()))

            if step % validation_interval == 0:
                model.eval()
                with th.no_grad():
                    valid_loader = DataLoader(valid_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)
                    metrics = defaultdict(list)
                    for valid_batch in valid_loader:
                        batch_results = evaluate(model, valid_batch, device)
                        for key, value in batch_results.items():
                            metrics[key].extend(value)
                print('')
                with open(results_path, 'a+') as f:
                    for key, value in metrics.items():
                        if key[-2:] == 'f1' or 'loss' in key:
                            eval_string = f'{key:27} : {np.mean(value):.4f}'
                            print(eval_string)
                            f.write(eval_string + '\n')
                    f.write('\n')
                model.train()
    except KeyboardInterrupt:
        # Ctrl+C stops training early; fall through to checkpoint and test.
        with open(results_path, 'a+') as f:
            dashes = '-' * 100
            print(dashes)
            f.write(dashes + '\n')
            early_log = 'Exiting from training early'
            print(early_log)
            f.write(early_log + '\n')

    # Save the checkpoint, then drop the training data before testing.
    th.save(
        {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'step': step,
            'cnn_unit': cnn_unit,
            'fc_unit': fc_unit
        },
        Path(logdir) / f'model-{step}.pt')
    del dataset, valid_dataset

    test_dataset = MAESTRO_small(groups=['test'],
                                 hop_size=HOP_SIZE,
                                 random_sample=False)
    model.eval()
    with th.no_grad():
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
        metrics = defaultdict(list)
        for test_batch in test_loader:
            batch_results = evaluate(model, test_batch, device,
                                     save=save_midi, save_path=logdir)
            for key, value in batch_results.items():
                metrics[key].extend(value)
    print('')
    for key, value in metrics.items():
        if key[-2:] == 'f1' or 'loss' in key:
            print(f'{key} : {np.mean(value)}')

    with open(results_path, 'a+') as f:
        for key, values in metrics.items():
            _, category, name = key.split('/')
            metric_string = f'{category:>32} {name:26}: {np.mean(values):.3f} +- {np.std(values):.3f}'
            print(metric_string)
            f.write(metric_string + '\n')