def load_references_and_mix(mixture_folder, spk_directories, mix_file):
    references = {}
    mixture, sr = utils.load_audio(os.path.join(mixture_folder, 'mix', mix_file))
    for spk_directory in spk_directories:
        reference = os.path.join(mixture_folder, spk_directory, mix_file)
        references[spk_directory.split('/')[-1]] = utils.load_audio(reference)[0][0]
    return mixture, references, sr
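# A minimal usage sketch of the loader above (folder layout and speaker
# directory names are hypothetical): the mixture lives in <mixture_folder>/mix/
# and each speaker directory holds a reference file with a matching name.
mixture, references, sr = load_references_and_mix(
    'data/mixtures', ['spk0', 'spk1'], 'utt_0001.wav')
for spk, ref in references.items():
    print(spk, ref.shape, sr)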
def __getitem__(self, index):
    # select the target based on the dataset index
    target_track_path = self.tracks[index]['path']
    if self.random_chunks:
        target_min_duration = self.tracks[index]['min_duration']
        target_start = random.uniform(
            0, target_min_duration - self.seq_duration
        )
    else:
        target_start = 0

    # optionally select a random interferer track
    if self.random_interferer_mix:
        random_idx = random.choice(range(len(self.tracks)))
        intfr_track_path = self.tracks[random_idx]['path']
        if self.random_chunks:
            intfr_min_duration = self.tracks[random_idx]['min_duration']
            intfr_start = random.uniform(
                0, intfr_min_duration - self.seq_duration
            )
        else:
            intfr_start = 0
    else:
        intfr_track_path = target_track_path
        intfr_start = target_start

    # get sources from the interferer track
    sources = list(intfr_track_path.glob('*' + self.ext))

    # load sources
    x = 0
    for source_path in sources:
        # skip the target file here; it is loaded separately below
        if source_path == intfr_track_path / self.target_file:
            continue
        try:
            audio = load_audio(
                source_path, start=intfr_start, dur=self.seq_duration
            )
        except RuntimeError:
            # fall back to a neighboring index if the file cannot be read
            index = index - 1 if index > 0 else index + 1
            return self.__getitem__(index)
        x += self.source_augmentations(audio)

    # load the selected track's target
    if Path(target_track_path / self.target_file).exists():
        y = load_audio(
            target_track_path / self.target_file,
            start=target_start, dur=self.seq_duration
        )
        y = self.source_augmentations(y)
        x += y
    # use silence if the target does not exist
    else:
        y = torch.zeros(audio.shape)
    return x, y
def __getitem__(self, index):
    input_path, output_path = self.tuple_paths[index]
    if self.seq_duration != 0.0:
        if self.random_chunks:
            input_info = load_info(input_path)
            output_info = load_info(output_path)
            duration = min(input_info['duration'], output_info['duration'])
            start = random.uniform(0, duration - self.seq_duration)
        else:
            start = 0
        X_audio = load_audio(input_path, start=start, dur=self.seq_duration)
        Y_audio = load_audio(output_path, start=start, dur=self.seq_duration)
    else:
        input_info = load_info(input_path)
        output_info = load_info(output_path)
        start = 0
        duration = min(input_info['duration'], output_info['duration'])
        X_audio = load_audio(input_path, start=start, dur=duration)
        Y_audio = load_audio(output_path, start=start, dur=duration)
    # return torch tensors
    return X_audio, Y_audio
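# A minimal consumption sketch for the paired __getitem__ above, assuming a
# hypothetical AlignedDataset class that owns it; the constructor arguments,
# batch size, and worker count are illustrative only.
from torch.utils.data import DataLoader

dataset = AlignedDataset(root='data/', seq_duration=5.0, random_chunks=True)
loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)
for X_audio, Y_audio in loader:
    ...  # aligned input/target chunks, one pair per example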
def align_audio(payload: PayLoad):
    try:
        load_audio(payload.bucket_id, payload.sub_dir, payload.file_name)
    except ClientError:
        raise HTTPException(status_code=404, detail="Item not found")
    prepare_text(payload.text)
    sync_map = force_align()
    clean_dir()
    response = Response(alignment=sync_map, file_name=payload.file_name)
    return response
def generate_samples(samples_path, label_path, batch_count):
    '''Generator that loads wav files, converts them to mel spectrograms,
    and looks up the matching labels.

    batch_count is the number of samples returned in each generator batch.
    '''
    sample_count = 0
    X_train = None
    Y_train = None
    while True:
        file = random.choice(os.listdir(samples_path))
        wav_path = os.path.join(samples_path, file)
        signal, sr = utils.load_audio(wav_path, mono=True)
        melgram = utils.make_melgram(signal, sr)
        file_name = file.split('.')[0]
        label = np.load(os.path.join(label_path, file_name + ".npy"))
        if X_train is None:  # first sample in the sequence: allocate batch buffers
            X_train = np.zeros((batch_count, melgram.shape[1],
                                melgram.shape[2], melgram.shape[3]))
            Y_train = np.zeros((batch_count, label.shape[0]))
            X_train[0] = melgram[0]
            Y_train[0] = label
        else:
            X_train[sample_count] = melgram[0]
            Y_train[sample_count] = label
        if sample_count == batch_count - 1:
            sample_count = 0
            yield X_train, Y_train
        else:
            sample_count += 1
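# A minimal training sketch for the generator above, assuming a compiled Keras
# model; the data directories, batch size, and step counts are hypothetical.
train_gen = generate_samples('data/wavs', 'data/labels', batch_count=32)
model.fit(train_gen, steps_per_epoch=100, epochs=10)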
def separate_signals(method: str,
                     model_filename: pathlib.Path,
                     data_conf: dict,
                     midi_note_nums: numpy.ndarray,
                     spectrogram: numpy.ndarray,
                     facwt: pyfacwt.FACWT,
                     facwt_params: dict,
                     decimation_factor: int,
                     save_waveform: bool = False,
                     gpu: int = -1,
                     overwrite: bool = False) -> dict:
    """Separate signals

    Args:
        method (str): Method
        model_filename (pathlib.Path): Model filename
        data_conf (dict): Input data configuration
        midi_note_nums (numpy.ndarray): MIDI note numbers of pitches
        spectrogram (numpy.ndarray): Observed complex spectrogram
        facwt (pyfacwt.FACWT): Fast approximate CWT instance
        facwt_params (dict): FACWT parameters
        decimation_factor (int): Decimation factor
        save_waveform (bool, optional): If True, dump separated signals. Defaults to False.
        gpu (int, optional): GPU number. Defaults to -1.
        overwrite (bool, optional): If True, overwrite separated signals. Defaults to False.

    Returns:
        dict: Separated signals keyed by MIDI note number
    """
    # load model
    model = joblib.load(model_filename)
    # keep only bases whose pitch appears in the ground-truth data
    valid_k_list = list(
        filter(lambda k: midi_note_nums[k] in data_conf["gt"].keys(),
               list(range(model.n_bases))))
    logger.info("valid k list: {} {}".format(valid_k_list, data_conf["gt"].keys()))
    if gpu >= 0:
        xp = cupy
        model.to_gpu()
    else:
        xp = numpy
    X = None
    facwt.verbose = 0  # suppress output
    separated_signals = {}
    for k in tqdm(valid_k_list, leave=True, desc=' {0: >10s}'.format('Valid basis ')):
        outfname = model_filename.parent / f"pitch{midi_note_nums[k]:03d}.wav"
        if not outfname.exists() or overwrite:
            if X is None:
                X = model.reconstruct()
                X[:] = xp.maximum(model.eps, X)
                X = X.astype('f')
            X_k = xp.maximum(model.eps, model.reconstruct(k_list=[k])).astype('f')
            weight = (X_k / X).astype('f')
            if xp == cupy:
                weight = cupy.asnumpy(weight)
            # interpolate masks at decimated frames
            if decimation_factor > 1:
                interpfun = interp1d(
                    numpy.arange(0, weight.shape[1]) * decimation_factor,
                    weight, kind="linear", axis=1,
                    bounds_error=False, fill_value="extrapolate")
                weight = interpfun(numpy.arange(0, spectrogram.shape[1])).astype('f')
            # squash weights into [0, 1]
            weight[weight < 0] = 0
            weight[weight > 1] = 1
            # masking
            Y_k = spectrogram * weight
            # convert into a time-domain signal
            separated_signals[midi_note_nums[k]] = spectrogram2signal(
                outfname, list(Y_k), facwt, save=save_waveform)
        else:
            separated_signals[midi_note_nums[k]] = load_audio(outfname)[0]
    return separated_signals
def __getitem__(self, idx):
    phn_file = self.files[idx]
    wav_file = self.files[idx][:-3] + "WAV.wav"
    labels = get_phn(phn_file, self.tokenizer)
    labels = self.pad(labels, pad_value=self.tokenizer.convert_token("[NULL]"))
    audio = load_audio(wav_file)
    audio = self.pad(audio)
    # crop an aligned random window of audio_length samples from both streams
    random_index = random.randint(0, len(audio) - self.audio_length)
    labels = labels[random_index:random_index + self.audio_length]
    audio = audio[random_index:random_index + self.audio_length]
    audio = self.normalize_audio(audio)
    if random.choice([0, 1]) == 1:
        labels, audio = self.add_silence(labels, audio)
    audio = self.add_noise(audio)
    assert audio.shape[0] == self.audio_length
    assert labels.shape[0] == self.audio_length
    return torch.FloatTensor(audio).unsqueeze(0), torch.LongTensor(labels)
def __getitem__(self, index):
    # for validation, get deterministic behavior by using the index as seed
    if self.split == 'valid':
        random.seed(index)
    # for each source, draw a random sound and mix them together
    audio_sources = []
    for source in self.source_folders:
        # select a random track for each source
        source_path = random.choice(self.source_tracks[source])
        if self.random_chunks:
            duration = load_info(source_path)['duration']
            start = random.uniform(0, duration - self.seq_duration)
        else:
            start = 0
        audio = load_audio(source_path, start=start, dur=self.seq_duration)
        audio = self.source_augmentations(audio)
        audio_sources.append(audio)
    stems = torch.stack(audio_sources)
    # apply a linear mix over the source dimension (dim=0)
    x = stems.sum(0)
    # target is always the last element in the list
    y = stems[-1]
    return x, y
def __getitem__(self, index):
    track_path = self.tracks[index]['path']
    min_duration = self.tracks[index]['min_duration']
    sources = list(track_path.glob('*' + self.ext))
    if self.random_chunks:
        start = random.uniform(0, min_duration - self.seq_duration)
    else:
        start = 0
    # load sources
    audio_sources = []
    for source_path in sources:
        try:
            audio = load_audio(source_path, start=start, dur=self.seq_duration)
        except RuntimeError:
            # fall back to a neighboring index if the file cannot be read
            index = index - 1 if index > 0 else index + 1
            return self.__getitem__(index)
        audio = self.source_augmentations(audio)
        audio_sources.append(audio)
    stems = torch.stack(audio_sources, dim=0)
    # apply a linear mix over the source dimension (dim=0)
    x = stems.sum(0)
    # target is the stem matching self.target_file; use silence if it is missing
    if track_path / self.target_file in sources:
        y = stems[sources.index(track_path / self.target_file)]
    else:
        y = torch.zeros(x.shape)
    return x, y
def add_noise(self, audio):
    noise_audio = load_audio(random.choice(self.noise_files))
    noise_audio = self.random_loudness(noise_audio)
    random_index = random.randint(0, len(noise_audio) - self.audio_length)
    return audio + noise_audio[random_index:random_index + self.audio_length]
def read_f0(ref_dir):
    paths = sorted(glob.glob(os.path.join(ref_dir, '*.wav')))
    f0_lst = []
    for path in paths:
        wav, sr = utils.load_audio(path)
        f0 = utils.get_f0(wav, sr, fmin=60, fmax=400)
        f0_lst.append(f0)
    return f0_lst
def load_audio_files(self, wav_file):
    sources = []
    # pick a random subset of microphone channels to use for this mix
    channel_indices = np.arange(self.channels_in_mix)
    np.random.shuffle(channel_indices)
    channel_indices = channel_indices[:self.num_channels]
    for speaker in self.speaker_folders:
        speaker_path = os.path.join(self.folder, speaker, wav_file)
        mix_path = os.path.join(self.folder, 'mix', wav_file)
        mix, _ = utils.load_audio(mix_path)
        source, _ = utils.load_audio(speaker_path)
        mix = mix[channel_indices]
        source = source[channel_indices]
        sources.append(source)
    return mix, sources, np.eye(self.num_speakers)
def job(fpath):
    wav_path = os.path.join(args.data_path, 'wavs', fpath.replace('npy', 'wav'))
    wav, sr = utils.load_audio(wav_path)
    mel = utils.get_mel_spectrogram(wav, sr)
    # ga = prepro_guided_attention(len(text), len(mel), g=args.g)
    f0 = utils.get_f0(wav, sr, fmin=60, fmax=400, spec_len=mel.shape[0])
    np.save(os.path.join(args.data_path, args.mel_dir, fpath), mel)
    np.save(os.path.join(args.data_path, args.f0_dir, fpath), f0)
    return None
def load_run_experiment_and_save(filename):
    clf = SVC(C=1, gamma=0.001, kernel='rbf', random_state=0)
    audio = load_audio(filename)
    evolution = refit_from_best(clf, audio)
    exp_filename = 'data/experiments/' + filename.split('/')[-1]
    exp_filename = exp_filename.replace('.wav', '.yaml')
    save_yaml(exp_filename, evolution)
    return evolution
def prepare_evaluate(conf: dict):
    '''Prepare for evaluation

    Args:
        conf (dict): Configuration of a mixture

    Returns:
        tuple[list, numpy.ndarray, numpy.ndarray]: Ground-truth pitches,
            input SDRs, and ground-truth signals (# of pitches x signal length)
    '''
    gt_list = conf["gt"]
    gt_pitches = sorted([int(p) for p in gt_list.keys()])
    refs = [load_audio(gt_list[p]) for p in gt_pitches]
    refs = numpy.stack(refs, axis=0)  # n_pitches x sig_len
    # load the mixture
    mixed = load_audio(conf["mix"])
    # compute input SDRs: the mixture split evenly across sources
    # serves as the trivial input estimate
    sdrs = compute_bsseval_v2(
        refs,
        numpy.tile(mixed[None, :refs.shape[1]] / refs.shape[0],
                   (refs.shape[0], 1)))
    return gt_pitches, sdrs, refs
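# A sketch of the conf layout implied by the accesses above (the paths and
# pitch numbers are hypothetical): "mix" points at the mixture wav and "gt"
# maps each ground-truth pitch to its isolated reference wav.
conf = {
    "mix": "data/mixture.wav",
    "gt": {
        60: "data/pitch060.wav",
        64: "data/pitch064.wav",
    },
}
gt_pitches, input_sdrs, refs = prepare_evaluate(conf)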
def load_test_data(model_path, instrument, fx, param_id):
    test_df = pd.read_csv(os.path.join(model_path, 'test_data.csv'))
    n_total_clips = test_df.shape[0]
    input_target_pairs = [0] * n_total_clips
    # resolve dry (unprocessed) and wet (processed) audio directories
    if fx is meta.FXCHAIN:
        dry_path = meta.params_path[meta.FXCHAIN][instrument][meta.NO_FX]
        wet_path = meta.params_path[meta.FXCHAIN][instrument][param_id]
    else:
        dry_path = meta.params_path[instrument][meta.NO_FX]
        wet_path = meta.params_path[instrument][fx]
    for idx, row in test_df.iterrows():
        audio_in = load_audio(os.path.join(dry_path, row['input_file']),
                              idx, n_total_clips, meta.NO_FX)
        audio_target = load_audio(os.path.join(wet_path, row['target_file']),
                                  idx, n_total_clips, fx)
        input_target_pairs[idx] = (audio_in, audio_target)
    return input_target_pairs
def __getitem__(self, index):
    # first, get the target track
    track_path = self.tracks[index]['path']
    min_duration = self.tracks[index]['min_duration']
    if self.random_chunks:
        # determine start seek by target duration
        start = random.uniform(0, min_duration - self.seq_duration)
    else:
        start = 0

    # assemble the mixture of target and interferers
    audio_sources = []

    # load target
    target_audio = load_audio(track_path / self.target_file,
                              start=start, dur=self.seq_duration)
    target_audio = self.source_augmentations(target_audio)
    audio_sources.append(target_audio)

    # load interferers
    for source in self.interferer_files:
        # optionally select a random track for each source
        if self.random_track_mix:
            random_idx = random.choice(range(len(self.tracks)))
            track_path = self.tracks[random_idx]['path']
            if self.random_chunks:
                min_duration = self.tracks[random_idx]['min_duration']
                start = random.uniform(0, min_duration - self.seq_duration)
        audio = load_audio(track_path / source, start=start, dur=self.seq_duration)
        audio = self.source_augmentations(audio)
        audio_sources.append(audio)

    stems = torch.stack(audio_sources)
    # apply a linear mix over the source dimension (dim=0)
    x = stems.sum(0)
    # target is always the first element in the list
    y = stems[0]
    return x, y
def main(_):
    if model_type == "IMAGE":
        features, labels = utils.load_images(args.input, args.batch_size)
    elif model_type == "AUDIO":
        features, labels = utils.load_audio(args.input, args.batch_size)
    elif model_type == "SPECT":
        features, labels = utils.load_spect(args.input, args.batch_size)
    print(features)

    ripeness_classifier = tf.estimator.Estimator(model_fn=model, model_dir=args.dir)

    # set up logging for predictions
    tensors_to_log = {"probabilities": "Predictions/softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=args.log_steps)

    # train the model
    if args.mode == "TRAIN":
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=features,
            y=labels,
            batch_size=args.batch_size,
            num_epochs=None,
            shuffle=True)
        ripeness_classifier.train(input_fn=input_fn,
                                  steps=args.steps,
                                  hooks=[logging_hook])
        print("Training completed")
    elif args.mode == "EVAL":
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=features,
            y=labels,
            batch_size=args.batch_size,
            num_epochs=1,
            shuffle=False)
        eval_results = ripeness_classifier.evaluate(input_fn=input_fn)
        print("Eval results:")
        print(eval_results)
    elif args.mode == "PREDICT":
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=features,
            batch_size=args.batch_size,
            num_epochs=1,
            shuffle=False)
        pred = list(ripeness_classifier.predict(input_fn=input_fn))
        print("Prediction results:")
        print(pred)
def load_data():
    features = []
    for file in os.listdir(data_set):
        # keep only wav files matching the position codes (Al/Ar/Pl/Pr),
        # COPD recordings, and the AKGC417L microphone
        if file.endswith(".wav") and (
                "Al" in file or 'Ar' in file or 'Pr' in file or 'Pl' in file) \
                and 'COPD' in file and 'AKGC417L' in file:
            class_label = utils.class_name(file)
            data_file = os.path.join(data_set, file)
            audio, sample_rate = utils.load_audio(data_file)
            raw_data = utils.extract_features(audio, sample_rate)
            features = utils.append_features(features, class_label, raw_data)
    featuresdf = pd.DataFrame(features, columns=['feature', 'class_label'])
    x = np.array(featuresdf.feature.tolist())
    y = np.array(featuresdf.class_label.tolist())
    le = LabelEncoder()
    yy = to_categorical(le.fit_transform(y))
    return x, yy
def init_image_dataset():
    for genre in classes.values():
        # create output directory
        if not os.path.exists(out_path + genre):
            os.mkdir(out_path + genre)
        # get all audio files
        files = os.listdir(in_path + genre)
        for f in files:
            # define paths
            audio_path = in_path + genre + "/" + f
            spec_path = out_path + genre + "/" + f
            # load audio and create spectrogram
            audio, fs = load_audio(audio_path)
            audio2spectrogram(audio, fs, spec_path)
            print("Saved:", spec_path)
def init_analytics_dataset():
    with open('dataset.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        # write header
        header = ("chroma_freqs spectral_centroid spectral_bandwidth "
                  "spectral_rolloff zero_crossing_rate")
        for i in range(1, 21):
            header += " mfcc" + str(i)
        header += " genre"
        writer.writerow(header.split())
        # write data
        for genre in classes.values():
            # get all audio files
            files = os.listdir(in_path + genre)
            for f in files:
                audio, fs = load_audio(in_path + genre + "/" + f)
                features = extract_features(audio, fs)
                features.append(genre)
                writer.writerow(features)
                print("Features extracted:", f)
def neural_predicate(self, network, path, in_training=True, versions=3):
    data = load_audio(str(path)[1:-1])
    if in_training:
        sig_t, sr, _ = self.t_transforms.apply(data, None)
    else:
        sig_t, sr, _ = self.v_transforms.apply(data, None)
    length = torch.tensor(sig_t.size(0))
    sr = torch.tensor(sr)
    data = [d.unsqueeze(0) for d in [sig_t, length, sr]]
    try:
        out_raw = network.net(data)
    except RuntimeError:
        # log the offending input before re-raising
        print(path)
        print(data)
        raise
    return out_raw.squeeze(0)
def wav2spectrogram(filename: str,
                    max_length: int = None,
                    sr: int = 16000,
                    start_pos: float = 0.0,
                    **kwargs):
    """Convert a wav file to a complex CWT spectrogram

    Args:
        filename (str): Wav filename
        max_length (int, optional): Maximum signal length [s]. Defaults to None.
        sr (int, optional): Sampling rate [Hz]. Defaults to 16000.
        start_pos (float, optional): Analysis start position of the waveform [s].
            Defaults to 0.0.
        **kwargs: Parameters for pyfacwt.FACWT

    Returns:
        tuple: FACWT instance, FACWT parameters, and the complex CWT spectrogram
    """
    wavdata = load_audio(filename, sr=sr, mono=True)
    if start_pos > 0.0:
        wavdata = wavdata[int(start_pos * sr):]
    if max_length is not None:
        wavdata = wavdata[:int(max_length * sr)]
    # set up FACWT
    facwt_params = dict(lowFreq=kwargs.get("lowFreq", 27.5),
                        highFreq=kwargs.get("highFreq", sr / 2),
                        fs=sr,
                        resol=kwargs.get("resol", 3),
                        width=kwargs.get("width", 2.0),
                        sd=kwargs.get("sd", numpy.log(2.0) / 60.0),
                        alpha=kwargs.get("alpha", 1.0),
                        multirate=kwargs.get("multirate", False),
                        minWidth=kwargs.get("minWidth", 2),
                        waveletType=kwargs.get("waveletType", "log_normal"),
                        verbose=kwargs.get("verbose", 1))
    facwt = FACWT(wavdata.shape[0], **facwt_params)
    # forward computation
    spectrogram = facwt.forward(wavdata)
    return facwt, facwt_params, spectrogram
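# A minimal usage sketch of wav2spectrogram above; the input filename and the
# parameter override are hypothetical. Analyzes 10 s of audio starting 0.5 s in.
facwt, facwt_params, spectrogram = wav2spectrogram(
    "input.wav", max_length=10, sr=16000, start_pos=0.5, resol=3)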
def _make_example(self, wav_name, text):
    wav_file = os.path.join(self.wav_dir, wav_name + '.wav')
    wav = load_audio(wav_file)
    mel, mag = get_spectrogram(wav)
    return {'text': text, 'mel': mel, 'mag': mag}
def main():
    sr = 44100
    hop_length = 512
    y_in = utils.load_audio("Awake.wav", sr)
    y_out = utils.load_audio("Light.wav", sr)
    cross_interval(y_in, y_out, sr, hop_length, 10)
def load_audio_files(self, jam_file):
    mix, sr = utils.load_audio(jam_file[:-4] + 'wav')
    mix = mix[0]
    jam = jams.load(jam_file)
    data = jam.annotations[0]['data']['value']
    classes = self.source_labels
    sources = []
    one_hots = []
    group = []
    used_classes = []
    keep_columns = []
    for d in data:
        if d['role'] == 'foreground':
            source_path = d['saved_source_file']
            source_path = os.path.join(self.folder, source_path.split('/')[-1])
            sources.append(utils.load_audio(source_path)[0][0])
            one_hot = np.zeros(len(classes))
            one_hot[self.source_indices[d['label']]] = 1
            used_classes.append(d['label'])
            one_hots.append(one_hot)
            if (d['label'] in self.group_sources
                    or d['label'] in self.ignore_sources):
                # grouped/ignored sources get merged into a single 'group' stem
                group.append(sources[-1])
                sources.pop()
                one_hots.pop()
                used_classes.pop()
            else:
                keep_columns.append(self.source_indices[d['label']])
    if len(self.group_sources) > 0:
        sources.append(sum(group))
        one_hot = np.zeros(len(classes))
        one_hot[self.source_indices['group']] = 1
        used_classes.append('group')
        one_hots.append(one_hot)
        keep_columns.append(self.source_indices['group'])
    if self.num_extra_sources > 0:
        # pad with silent extra sources drawn from unused classes
        num_sources = len(sources)
        shuffled = random.sample(classes, len(classes))
        for class_name in shuffled:
            if class_name not in used_classes:
                if len(sources) >= num_sources + self.num_extra_sources:
                    break
                one_hot = np.zeros(len(classes))
                one_hot[classes.index(class_name)] = 1
                one_hots.append(one_hot)
                sources.append(np.zeros(sources[-1].shape))
                used_classes.append(class_name)
    length_cutoff = int(mix.shape[0] * self.length)
    mix = mix[:length_cutoff]
    sources = [source[:length_cutoff] for source in sources]
    if self.reorder_sources:
        source_order = [
            used_classes.index(c) for c in self.source_labels
            if c in used_classes
        ]
        sources = [sources[i] for i in source_order]
        one_hots = [one_hots[i] for i in source_order]
    if self.group_sources:
        one_hots = np.stack(one_hots)[:, sorted(keep_columns)]
    else:
        one_hots = np.stack(one_hots)
    return mix, sources, one_hots
def main(unused_argv=None):
    tf.logging.set_verbosity(FLAGS.log)

    if FLAGS.checkpoint_path:
        checkpoint_path = utils.shell_path(FLAGS.checkpoint_path)
    else:
        expdir = utils.shell_path(FLAGS.expdir)
        tf.logging.info("Will load latest checkpoint from %s.", expdir)
        while not tf.gfile.Exists(expdir):
            tf.logging.fatal("\tExperiment save dir '%s' does not exist!", expdir)
            sys.exit(1)
        try:
            checkpoint_path = tf.train.latest_checkpoint(expdir)
        except tf.errors.NotFoundError:
            tf.logging.fatal("There was a problem determining the latest checkpoint.")
            sys.exit(1)

    if not tf.train.checkpoint_exists(checkpoint_path):
        tf.logging.fatal("Invalid checkpoint path: %s", checkpoint_path)
        sys.exit(1)
    tf.logging.info("Will restore from checkpoint: %s", checkpoint_path)

    source_path = utils.shell_path(FLAGS.source_path)
    tf.logging.info("Will load Wavs from %s." % source_path)

    save_path = utils.shell_path(FLAGS.save_path)
    tf.logging.info("Will save embeddings to %s." % save_path)
    if not tf.gfile.Exists(save_path):
        tf.logging.info("Creating save directory...")
        tf.gfile.MakeDirs(save_path)

    sample_length = FLAGS.sample_length
    batch_size = FLAGS.batch_size

    def is_wav(f):
        return f.lower().endswith(".wav")

    wavfiles = sorted([
        os.path.join(source_path, fname)
        for fname in tf.gfile.ListDirectory(source_path) if is_wav(fname)
    ])

    for start_file in range(0, len(wavfiles), batch_size):
        batch_number = (start_file // batch_size) + 1
        tf.logging.info("On file number %s (batch %d).", start_file, batch_number)
        end_file = start_file + batch_size
        wavefiles_batch = wavfiles[start_file:end_file]

        # pad the batch with copies of the last file so it has batch_size elements
        batch_filler = batch_size - len(wavefiles_batch)
        wavefiles_batch.extend(batch_filler * [wavefiles_batch[-1]])
        wav_data = np.array(
            [utils.load_audio(f, sample_length) for f in wavefiles_batch])
        try:
            tf.reset_default_graph()
            # load up the model for encoding and compute the encoding
            encoding = encode(wav_data, checkpoint_path, sample_length=sample_length)
            if encoding.ndim == 2:
                encoding = np.expand_dims(encoding, 0)
            tf.logging.info("Encoding:")
            tf.logging.info(encoding.shape)
            tf.logging.info("Sample length: %d" % sample_length)
            for num, (wavfile, enc) in enumerate(zip(wavefiles_batch, encoding)):
                filename = "%s_embeddings.npy" % wavfile.split("/")[-1].strip(".wav")
                with tf.gfile.Open(os.path.join(save_path, filename), "w") as f:
                    np.save(f, enc)
                # skip the padded duplicates at the end of the batch
                if num + batch_filler + 1 == batch_size:
                    break
        except Exception as e:
            tf.logging.info("Unexpected error happened: %s.", e)
            raise
def __getitem__(self, index):
    # first, get the target track
    track_path = self.tracks[index]['path']
    min_duration = self.tracks[index]['min_duration']
    if self.random_chunks:
        # determine start seek by target duration
        start = random.uniform(0, min_duration - self.seq_duration)
    else:
        start = 0

    # assemble the mixture of target and interferers
    audio_sources = []
    midi_sources = []
    start_ends = []

    # randomly shuffle the source order; the target ends up last
    self.source_files = random.sample(self.source_files, len(self.source_files))
    for index, source in enumerate(self.source_files):
        if self.random_chunks:
            start = random.uniform(0, min_duration - self.seq_duration)
        else:
            start = 0
        audio = load_audio(track_path / source, start=start, dur=self.seq_duration)
        audio = torch.unsqueeze(self.source_augmentations(audio), 0)
        audio_sources.append(audio)
        start_ends.append((start, start + self.seq_duration))
        midi_path = os.path.join(str(track_path), source.split('.')[0] + '.txt')
        midi_sources.append(midi_path)

    stems = torch.stack(audio_sources)
    # apply a linear mix over the source dimension (dim=0)
    x = stems.sum(0)
    # target is the last element in the list
    y = stems[-1]

    # time series to STFT magnitude spectrograms
    x = self.stft.forward(x)
    x = self.spec.forward(x)
    y = self.stft.forward(y)
    y = self.spec.forward(y)

    # hard mask to soft mask
    mask_accom = midi_to_mask(
        x.permute(1, 0, 2)[0].numpy(), midi_sources[0], start_ends[0])
    mask_target = midi_to_mask(
        x.permute(1, 0, 2)[0].numpy(), midi_sources[-1], start_ends[-1])
    mask_target = mask_target / (mask_target + mask_accom)
    x_filtered = mask_target * x.permute(1, 0, 2)[0].numpy()
    # expand dimensions for the model
    x_filtered = torch.tensor(np.expand_dims(x_filtered, 1))
    return x, y, x_filtered
def load_and_preprocess(file_path, sr, bits):
    x = load_audio(file_path.numpy(), sr)
    x = encode_mulaw(x, bits)
    return x
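# The file_path.numpy() call suggests this function is meant to run inside a
# tf.py_function within a tf.data pipeline; a minimal sketch under that
# assumption (the glob pattern, sr, bits, and output dtype are hypothetical).
import tensorflow as tf

sr, bits = 16000, 8
files = tf.data.Dataset.list_files("wavs/*.wav")
dataset = files.map(
    lambda p: tf.py_function(
        func=lambda fp: load_and_preprocess(fp, sr, bits),
        inp=[p],
        Tout=tf.int32),  # assumed dtype of the mu-law codes
    num_parallel_calls=tf.data.AUTOTUNE)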
for i in range(generation_step):
    # predict the next-sample distribution with the model
    preds = model.predict(np.expand_dims(pred_seed, 0))
    sampled = sample(preds[0][-1])  # multinomial sampling

    # to prevent dead silence, count consecutive identical samples
    if sampled == prev_sample:
        equal_cnt += 1
    else:
        equal_cnt = 0
    prev_sample = sampled

    # make the sample into a one-hot vector and append it
    sampled_onehot = np.zeros([1, 1, input_dim])
    sampled_onehot[0][0][sampled] = 1
    generated_sample = np.append(generated_sample, sampled_onehot, axis=1)
    # make a new seed as the next generation input
    pred_seed = generated_sample[0][i + 1:i + 1 + sample_len]

    if equal_cnt > 1000:
        # inject an impulse to break out of prolonged silence
        impulse_audio = load_audio(impulse[impulse_idx])
        impulse_audio = mu_quantize(impulse_audio, input_dim)
        impulse_audio = impulse_audio[:1000]
        impulse_audio = q_to_one_hot(impulse_audio, input_dim)
        for j in range(1000):
            pred_seed[sample_len - 1000 + j] = impulse_audio[j]
        print('Inject impulse.')
        if impulse_idx == len(impulse) - 1:
            impulse_idx = 0
        else:
            impulse_idx += 1
        equal_cnt = 0

    print('generated %ith sample ==> %i (equal_cnt = %i)'
          % ((i + 1), sampled, equal_cnt), end='\r')

# save generated samples as a file