def match_to_phase(self, reconst, phase):
    # Refine a log-magnitude reconstruction so it is consistent with the
    # given phase, Griffin-Lim style: repeatedly resynthesize samples and
    # re-analyze them while keeping the phase fixed.
    width = reconst.shape[2]
    reconst_exp = conversions.from_log(
        reconst, options=self.conv_options,
        make_pos_fn=lambda val: val.clamp(min=0.0),
        exp_fn=torch.exp)
    for _ in range(self.iters):
        samples = self.stft.inverse(reconst_exp, phase)
        reconst_exp, _ = self.stft.transform(samples)
        # The forward transform can pad the frame axis; trim back to the
        # original width.
        reconst_exp = reconst_exp[:, :, :width]
    reconst = conversions.to_log(
        reconst_exp.clamp(min=0.0), options=self.conv_options,
        log_fn=torch.log)
    return reconst
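
# A minimal standalone sketch of the same fixed-phase iteration, written
# against torch.stft/torch.istft directly rather than the project's STFT
# wrapper. The FFT size, hop, shapes, and random inputs below are
# illustrative assumptions only.
import math

import torch

n_fft, hop = 512, 128
window = torch.hann_window(n_fft)
mag = torch.rand(1, n_fft // 2 + 1, 100)    # magnitude estimate
phase = torch.rand_like(mag) * 2 * math.pi  # phase to stay consistent with

for _ in range(4):
    spec = torch.polar(mag, phase)          # fixed phase, current magnitude
    samples = torch.istft(spec, n_fft, hop, window=window)
    spec = torch.stft(samples, n_fft, hop, window=window,
                      return_complex=True)
    mag = spec.abs()                        # magnitude after the round trip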
def __init__(self, data_path, example_tensor, features, get_phase=False,
             num_speakers=100, speaker_start_index=0,
             speaker_take_count=40, utterance_take_count=24):
    self.data_path = data_path
    self.example_tensor = example_tensor
    # Conversion options are stored alongside the preprocessed data.
    with open(os.path.join(data_path, 'conv_options.pkl'), 'rb') as f:
        self.conv_options = pickle.load(f)
    self.direct_feature_extractor = lambda mag_frames: \
        to_torch(mag_frames, example_tensor)
    # Resolve the feature name to an extractor function once, up front.
    if features == 'direct':
        self.feature_extractor = self.direct_feature_extractor
    elif features == 'log':
        self.feature_extractor = lambda mag_frames: \
            to_torch(conversions.to_log(
                mag_frames, self.conv_options), example_tensor)
    elif features == 'mag_norm':
        band_mags = np.load(os.path.join(data_path, 'band_mags.npy'))
        self.feature_extractor = lambda mag_frames: \
            to_torch(conversions.to_mag_norm(
                mag_frames, band_mags, self.conv_options), example_tensor)
    elif features == 'two':
        band_mags = np.load(os.path.join(data_path, 'band_mags.npy'))
        self.feature_extractor = lambda mag_frames: \
            to_torch(conversions.to_two(
                mag_frames, band_mags, self.conv_options), example_tensor)
    else:
        raise RuntimeError("Invalid feature type: " + features)
    self.get_phase = get_phase
    self.num_speakers = num_speakers
    self.speaker_start_index = speaker_start_index
    self.speaker_take_count = speaker_take_count
    self.utterance_take_count = utterance_take_count
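
# A self-contained sketch of the dispatch pattern used above: map the
# feature name to an extractor function at construction time so the hot
# path never branches. The function name and the log floor here are
# assumptions for illustration, not part of the codebase.
import numpy as np

def make_extractor(features):
    if features == 'direct':
        return lambda frames: frames
    elif features == 'log':
        return lambda frames: np.log(np.maximum(frames, 1e-8))
    raise RuntimeError("Invalid feature type: " + features)

extract = make_extractor('log')
print(extract(np.ones((2, 3))))  # zeros, since log(1) == 0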
DATA_PATH, "speech_" + str(speaker) + ".npy") speech = np.load(path) path = os.path.join( DATA_PATH, "sizes_" + str(speaker) + ".npy") sizes = np.load(path) num_utterances = sizes.shape[0] indices = np.concatenate([[0], np.cumsum(sizes)]) encoded = example_tensor.new_zeros(num_utterances, ENCODED_DIM) for utterance in range(num_utterances): start_index = indices[utterance] end_index = indices[utterance + 1] value = to_torch(conversions.to_log( speech[start_index:end_index], conv_options), example_tensor) encoded[utterance] = encoder(value)[0].detach() eligible_set = torch.arange(num_utterances) while eligible_set.size()[0] > 1: num_eligible = eligible_set.size()[0] encoded_subset = encoded[eligible_set] encoded_mean = encoded_subset.mean(dim=0, keepdim=True) sq_distances = ((encoded_subset - encoded_mean) ** 2).sum(dim=1) _, best_indices = torch.topk( sq_distances, num_eligible // 2, largest=False) eligible_set = eligible_set[best_indices] center = encoded[eligible_set[0]].unsqueeze(0)
encoder = encoder.cuda()
result = torch.zeros(1, ENCODED_DIM)
files = [f for f in os.listdir(INPUT_PATH) if f.find('.wav') != -1]
encoded = example_tensor.new_zeros(len(files), ENCODED_DIM)
print("Found %d files" % len(files))
for (i, file) in enumerate(files):
    path = os.path.join(INPUT_PATH, file)
    speech, _ = conversions.encode(
        SAMPLE_RATE, conversions.load_wav(path, SAMPLE_RATE), conv_options)
    value = to_torch(conversions.to_log(speech, conv_options), example_tensor)
    encoded[i] = encoder(value).detach()
# Same halving scheme as above: converge on the embedding nearest the
# center of the set.
eligible_set = torch.arange(len(files))
while eligible_set.size()[0] > 1:
    num_eligible = eligible_set.size()[0]
    encoded_subset = encoded[eligible_set]
    encoded_mean = encoded_subset.mean(dim=0, keepdim=True)
    sq_distances = ((encoded_subset - encoded_mean) ** 2).sum(dim=1)
    _, best_indices = torch.topk(
        sq_distances, num_eligible // 2, largest=False)
    eligible_set = eligible_set[best_indices]
result = encoded[eligible_set[0]].unsqueeze(0).cpu()
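
# The halving loop appears twice above; a hypothetical helper like this
# (central_index is an assumed name, not part of the codebase) captures
# the shared technique as a standalone, runnable sketch.
import torch

def central_index(encoded):
    # Keep halving toward the mean; return the index of the survivor.
    eligible = torch.arange(encoded.size(0))
    while eligible.size(0) > 1:
        subset = encoded[eligible]
        mean = subset.mean(dim=0, keepdim=True)
        sq_dist = ((subset - mean) ** 2).sum(dim=1)
        _, best = torch.topk(sq_dist, eligible.size(0) // 2, largest=False)
        eligible = eligible[best]
    return eligible[0].item()

points = torch.randn(16, 8)
print(central_index(points))  # index of a robustly central row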