def convert_image(self, image_path):
    # Read the image and run Tesseract OCR on it
    # (--psm 6: assume a single uniform block of text)
    img = cv2.imread(image_path)
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')

    image_path = os.path.normpath(image_path)
    file_name = image_path.split(os.sep)[-2]
    tag_name = image_path.split(os.sep)[-1].split(".")[0]

    file_path = os.path.join(self._text_out_dir, file_name)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    text_file_path = os.path.join(file_path, tag_name + ".txt")
    print_error(text_file_path)

    with open(text_file_path, "w") as fd:
        fd.write("%s" % text)

    return text_file_path
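# The helper below is a minimal, self-contained sketch of the same OCR flow as
# `convert_image` (read the image with OpenCV, run Tesseract with --psm 6, write the
# recognized text to a chosen output directory). It assumes OpenCV, pytesseract and a
# local Tesseract install are available; the function name and paths are illustrative only.
import os

import cv2
import pytesseract


def ocr_to_text_file(image_path, text_out_dir):
    img = cv2.imread(image_path)  # BGR image as a numpy array, None if unreadable
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    os.makedirs(text_out_dir, exist_ok=True)
    out_path = os.path.join(
        text_out_dir, os.path.splitext(os.path.basename(image_path))[0] + ".txt")
    with open(out_path, "w") as fd:
        fd.write(text)
    return out_path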
def __init__(self, hparams=None, data_iterator=None):
    """
    Deep clustering model for speaker separation, based on
    https://arxiv.org/abs/1508.04306

    :param hparams: model hyper-parameters (see `default_hparams`)
    :param data_iterator: data iterator that provides the training batches
    """
    # ITextFeature.__init__(self)
    ModelBase.__init__(self, hparams=hparams)
    ShabdaWavPairFeature.__init__(self)
    self._hparams = HParams(hparams, self.default_hparams())

    print_error(self._hparams)

    self.lstm_hidden_size = self._hparams.lstm_hidden_size
    self.batch_size = self._hparams.batch_size
    self.p_keep_ff = self._hparams.p_keep_ff
    self.p_keep_rc = self._hparams.p_keep_rc
    self.neff = self._hparams.neff
    self.embd_dim_k = self._hparams.embd_dim_k
    self.frames_per_sample = self._hparams.frames_per_sample

    self.weights = None
    self.biases = None
def _get_speech_features(wav_file_1, wav_file_2, sampling_rate, frame_size,
                         neff, amp_fac, min_amp, threshold, global_mean,
                         global_std, frames_per_sample):
    """
    Mixes the two given speech files and extracts the per-speaker log-spectrum
    features, the normalized mixture features and the voice-activity mask,
    grouped into samples of `frames_per_sample` frames.

    :return: list of training samples, or an empty list on failure
    """
    # (wav_file_1, wav_file_2, sampling_rate, frame_size, neff, amp_fac, min_amp,
    #  threshold, global_mean, global_std, frames_per_sample) = args
    try:
        # TODO: experimental though - is multi-processing required here? to reduce IO
        # print_info("{} {}".format(wav_file_1, wav_file_2))

        def _get_data():
            speech_1 = _get_speech_data(wav_file_1, sampling_rate)
            speech_2 = _get_speech_data(wav_file_2, sampling_rate)

            # print(wav_file_1, wav_file_2)

            # find the minimum length of the two speeches
            length = min(len(speech_1), len(speech_2))
            # trim both speeches to the minimum length
            speech_1 = speech_1[:length]
            speech_2 = speech_2[:length]
            # mix the signals
            speech_mix = speech_1 + speech_2

            # get the spectral features in dB
            speech_1_features = _get_log_spectrum_features(
                speech_1, frame_size, neff, amp_fac, min_amp)
            speech_2_features = _get_log_spectrum_features(
                speech_2, frame_size, neff, amp_fac, min_amp)
            speech_mix_features = _get_log_spectrum_features(
                speech_mix, frame_size, neff, amp_fac, min_amp)

            max_mag = np.max(speech_mix_features)
            # apply a threshold to the feature signal to find the silent portions of
            # the signal and construct a boolean array as a feature
            # https://en.wikipedia.org/wiki/Voice_activity_detection
            speech_voice_activity_detection = (speech_mix_features > (max_mag - threshold))
            # normalize the signal values with the given global mean and std
            speech_mix_features_final = (speech_mix_features - global_mean) / global_std

            number_frames = speech_1_features.shape[0]

            new_data = _get_speech_samples(speech_1_features,
                                           speech_2_features,
                                           frames_per_sample,
                                           number_frames,
                                           speech_mix_features_final,
                                           speech_voice_activity_detection)
            # print_error("deleting speech_1_features, speech_2_features, speech_voice_activity_detection, speech_mix_features")
            speech_1_features = speech_2_features = speech_voice_activity_detection = \
                speech_mix_features = speech_mix_features_final = None
            return new_data

        new_data = _get_data()
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as err:
        print_error(traceback.print_exc())
        new_data = []
    return new_data
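# A small numpy-only sketch (not part of the pipeline above) of how the dB-threshold
# voice-activity mask is derived from a magnitude spectrogram: clip, convert to dB,
# then mark every bin within `threshold` dB of the loudest bin as active. The default
# values for `amp_fac`, `min_amp` and `threshold` are assumptions for illustration.
import numpy as np


def vad_mask_sketch(magnitude_spectrogram, amp_fac=10000, min_amp=10000, threshold=40):
    # clip very small magnitudes so the log stays bounded
    clipped = np.maximum(magnitude_spectrogram,
                         np.max(magnitude_spectrogram) / min_amp)
    features_db = 20. * np.log10(clipped * amp_fac)
    # boolean mask: True where the bin is close enough to the global maximum
    return features_db > (np.max(features_db) - threshold)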
def generate_features(self, wav_file_1, wav_file_2):
    try:
        start = time.time()

        speech_1, _ = librosa.core.load(wav_file_1, sr=self._hparams.sampling_rate)
        # random amplification factor between -3 dB and +3 dB
        fac = np.random.rand(1)[0] * 6 - 3
        speech_1 = 10.**(fac / 20) * speech_1

        speech_2, _ = librosa.core.load(wav_file_2, sr=self._hparams.sampling_rate)
        fac = np.random.rand(1)[0] * 6 - 3
        speech_2 = 10.**(fac / 20) * speech_2

        # mix the two speakers
        length = min(len(speech_1), len(speech_2))
        speech_1 = speech_1[:length]
        speech_2 = speech_2[:length]
        speech_mix = speech_1 + speech_2

        # compute the log spectrum for the 1st speaker
        speech_1_features = np.abs(
            stft(speech_1, self._hparams.frame_size)[:, :self._hparams.neff])
        speech_1_features = np.maximum(
            speech_1_features, np.max(speech_1_features) / self._hparams.min_amp)
        speech_1_features = 20. * np.log10(
            speech_1_features * self._hparams.amp_fac)

        # same for the 2nd speaker
        speech_2_features = np.abs(
            stft(speech_2, self._hparams.frame_size)[:, :self._hparams.neff])
        speech_2_features = np.maximum(
            speech_2_features, np.max(speech_2_features) / self._hparams.min_amp)
        speech_2_features = 20. * np.log10(
            speech_2_features * self._hparams.amp_fac)

        # same for the mixture
        speech_mix_spec0 = stft(
            speech_mix, self._hparams.frame_size)[:, :self._hparams.neff]
        speech_mix_features = np.abs(speech_mix_spec0)
        # speech_phase = speech_mix_spec0 / speech_mix_spec
        speech_mix_features = np.maximum(
            speech_mix_features, np.max(speech_mix_features) / self._hparams.min_amp)
        speech_mix_features = 20. * np.log10(
            speech_mix_features * self._hparams.amp_fac)

        max_mag = np.max(speech_mix_features)
        # if np.isnan(max_mag):
        #     import ipdb; ipdb.set_trace()
        speech_VAD = (speech_mix_features > (max_mag - self._hparams.threshold)).astype(int)

        speech_mix_features = (
            speech_mix_features - self._hparams.global_mean) / self._hparams.global_std

        # The ideal binary mask gives ownership of a time-frequency bin to the source
        # whose magnitude is maximum among all sources in that bin.
        # The mask values are 1 for active and 0 otherwise (binary),
        # making Y x Y^T the ideal affinity matrix for the mixture.
        Y = np.array([
            speech_1_features > speech_2_features,
            speech_1_features < speech_2_features
        ]).astype('bool')
        Y = np.transpose(Y, [1, 2, 0]).astype('bool')

        # speech_mix_features = speech_mix_features[0:self._hparams.dummy_slicing_dim, :]
        # speech_VAD = speech_VAD[0:self._hparams.dummy_slicing_dim, :]
        # Y = Y[0:self._hparams.dummy_slicing_dim, :, :]

        # print_info("{} vs {}".format(wav_file_1, wav_file_2))
        end = time.time()

        print_info("Thread name: {} : took {}".format(
            threading.currentThread().getName(), end - start))

        if speech_mix_features.shape[0] != 1247 or speech_VAD.shape[0] != 1247 or Y.shape[0] != 1247:
            raise Exception("Found files with improper duration/data")

        return speech_mix_features.astype('float32'), speech_VAD.astype('bool'), Y.astype('bool')
    except Exception as e:
        print_warn(e)
        print_error("{} vs {}".format(wav_file_1, wav_file_2))
        return np.random.random((self._hparams.dummy_slicing_dim, 129)).astype('float32'), \
               np.empty((self._hparams.dummy_slicing_dim, 129), dtype="bool"), \
               np.empty((self._hparams.dummy_slicing_dim, 129, 2), dtype="bool")
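# Minimal sketch of the ideal-binary-mask target `Y` built above: each time-frequency
# bin is assigned to whichever source is louder, yielding a [frames, neff, 2] boolean
# array. The random inputs and the 1247 x 129 shapes are assumptions for illustration only.
import numpy as np


def ideal_binary_mask(speech_1_db, speech_2_db):
    Y = np.array([speech_1_db > speech_2_db,
                  speech_1_db < speech_2_db])   # [2, frames, neff]
    return np.transpose(Y, [1, 2, 0])           # [frames, neff, 2]


# example: two fake log-spectrograms of 1247 frames x 129 bins
Y = ideal_binary_mask(np.random.rand(1247, 129), np.random.rand(1247, 129))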
def discriminator(self, x, out_channel_dim, is_training=True, reuse=False):
    # It must be Auto-Encoder style architecture
    # Architecture : (64)4c2s-FC32_BR-FC64*14*14_BR-(1)4dc2s_S
    with tf.variable_scope("namespace_discriminator", reuse=reuse):
        # net = tf.nn.relu(conv2d(x, 64, 4, 4, 2, 2, name='d_conv1'))
        net = tf.layers.conv2d(
            x, 64, 4, strides=2, padding='same',
            kernel_initializer=tf.random_normal_initializer(stddev=0.02),
            name='d_conv1')
        net = tf.nn.relu(net)

        tf.logging.info("======> net: {}".format(net))
        print_error("net1: {} ".format(net))

        size = (self.image_size // 2)
        net = tf.reshape(net, [self._data_iterator.batch_size, size * size * 64])

        # code = tf.nn.relu(bn(linear(net, 32, scope='d_fc6'), is_training=is_training, scope='d_bn6'))
        code = tf.contrib.layers.fully_connected(inputs=net, num_outputs=32, scope="d_fc6")
        code = tf.contrib.layers.batch_norm(code,
                                            decay=0.9,
                                            updates_collections=None,
                                            epsilon=1e-5,
                                            scale=True,
                                            is_training=is_training,
                                            scope='d_bn6')
        code = tf.nn.relu(code)

        print_error("code: {} ".format(code))

        # net = tf.nn.relu(bn(linear(code, 64 * 14 * 14, scope='d_fc3'), is_training=is_training, scope='d_bn3'))
        size = (self.image_size // 2)
        net = tf.contrib.layers.fully_connected(inputs=code, num_outputs=64 * size * size, scope="d_fc3")
        net = tf.contrib.layers.batch_norm(net,
                                           decay=0.9,
                                           updates_collections=None,
                                           epsilon=1e-5,
                                           scale=True,
                                           is_training=is_training,
                                           scope='d_bn3')

        print_error("net: {} ".format(net))
        print_error(net)

        size = (self.image_size // 2)
        net = tf.reshape(net, [self._data_iterator.batch_size, size, size, 64])

        print_error(net)

        # out = tf.nn.sigmoid(deconv2d(net, [self.gan_config.batch_size, 28, 28, 1], 4, 4, 2, 2, name='d_dc5'))
        net = tf.layers.conv2d_transpose(net, out_channel_dim, 4, strides=2, padding='same', name='d_dc5')
        out = tf.nn.sigmoid(net)

        print_info("==================================")
        print_info(out)
        print_info(x)

        # recon loss
        recon_error = tf.sqrt(2 * tf.nn.l2_loss(out - x)) / self._data_iterator.batch_size

        print_info("==================================")
        print_error(recon_error)

        return out, recon_error, code
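# The reconstruction error above, written out in numpy for clarity: since
# tf.nn.l2_loss(t) == 0.5 * sum(t**2), sqrt(2 * l2_loss(out - x)) is the Frobenius
# norm of (out - x), which is then divided by the batch size. Inputs here are
# illustrative assumptions.
import numpy as np


def recon_error_sketch(out, x, batch_size):
    return np.sqrt(np.sum((out - x) ** 2)) / batch_size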
def visulaize(self, executor, file_path):
    """
    Visualizes the predicted embeddings for a given mixed audio file using
    K-means clustering and a 3-component PCA projection.

    :param executor: executor that wraps the trained estimator
    :param file_path: path to the mixed wav file
    :return:
    """
    estimator = executor.estimator

    in_data_features, voice_activity_detection_data_features, phase_features = \
        self._get_predict_samples(file_path=file_path)

    in_data_features = np.asarray(in_data_features)
    voice_activity_detection_data_features = np.asarray(voice_activity_detection_data_features)

    N_frames = in_data_features.shape[0]
    hop_size = self._hparams.frame_size // 4

    def get_dataset():
        dataset = tf.data.Dataset.from_tensor_slices(({
            self.FEATURE_1_NAME: in_data_features,
            self.FEATURE_2_NAME: voice_activity_detection_data_features
        }, np.ones_like(in_data_features)))
        dataset = dataset.batch(batch_size=1)
        print_info(dataset.output_shapes)
        return dataset

    predict_fn = estimator.predict(input_fn=lambda: get_dataset())

    print_info("Shape of in data: {}".format(in_data_features.shape))
    print_info("Number of frames for given file: {}".format(N_frames))

    embeddings = []
    i = 0
    for predicted_value in predict_fn:
        # print("i = {}".format(i))
        """
        TODO: strange behaviour!
        1 wav file = N samples. Eg: N = 600
        FramesPerSample=100, BatchSize = 1, NEFF = 129, EMD_K = 30
        For each sample the embeddings are of shape [batch_size * frames_per_sample, NEFF, embd_dim].
        For prediction the batch size is made 1, hence the embeddings collapse to
        [frames_per_sample, NEFF, embd_dim].
        1 sample prediction will have `frames_per_sample` outputs.
        Eg: If the input audio file has 75 frames, the prediction will have [7500, NEFF, embd_dim]
        """
        embeddings.append(predicted_value)
        i += 1

    print_info("Number of embeddings predicted for given file: {}".format(len(embeddings)))
    print_error(np.asarray(embeddings).shape)

    N_assign = 0
    step = 0

    for frame_i in tqdm(range(N_frames)):
        # expand the dimension to be in line with the TF batch size
        in_data_np = np.expand_dims(in_data_features[frame_i], axis=0)
        in_phase_np = np.expand_dims(phase_features[frame_i], axis=0)
        voice_activity_detection_data_np = np.expand_dims(
            voice_activity_detection_data_features[frame_i], axis=0)
        embedding_np = np.asarray(
            embeddings[frame_i:frame_i + self._hparams.frames_per_sample])

        # ----------------------------------------------

        # collect the embeddings of the active (voiced) time-frequency bins
        embedding_ac = []
        for i, j in itertools.product(
                range(self._hparams.frames_per_sample),
                range(self._hparams.neff)):
            if voice_activity_detection_data_np[0, i, j] == 1:
                embedding_ac.append(embedding_np[i, j, :])

        kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)

        # visualization using the first 3 principal components
        pca_Data = PCA(n_components=3).fit_transform(embedding_ac)
        fig = plt.figure(1, figsize=(8, 6))
        ax = Axes3D(fig, elev=-150, azim=110)
        # ax.scatter(pca_Data[:, 0], pca_Data[:, 1], pca_Data[:, 2],
        #            c=kmean.labels_, cmap=plt.cm.Paired)
        ax.scatter(pca_Data[:, 0], pca_Data[:, 1], pca_Data[:, 2], cmap=plt.cm.Paired)
        ax.set_title('Embedding visualization using the first 3 PCs')
        ax.set_xlabel('1st pc')
        ax.set_ylabel('2nd pc')
        ax.set_zlabel('3rd pc')

        if not os.path.exists("vis"):
            os.makedirs("vis")
        plt.savefig('vis/' + str(step) + 'pca.jpg')
        step += 1
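# A compact sketch of the clustering-and-projection step used above: cluster the
# embeddings of the active time-frequency bins into two speakers, then project them
# onto the first 3 principal components for plotting. Random data stands in for real
# embeddings; the sizes (500 active bins, 40-dim embeddings) are assumptions.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

embedding_ac = np.random.rand(500, 40)                        # [active bins, embd_dim]
labels = KMeans(n_clusters=2, random_state=0).fit(embedding_ac).labels_
pca_points = PCA(n_components=3).fit_transform(embedding_ac)  # [active bins, 3]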
def discriminator(self, images, input_z, reuse=False):
    """
    Create the _discriminator network

    :param images: Tensor of input image(s)
    :param input_z: Tensor of the conditioning/noise vector concatenated before the dense layers
    :param reuse: Boolean, whether the weights should be reused
    :return: Tuple of (tensor output of the _discriminator, tensor logits of the _discriminator)
    """
    _, width, height, channel = images.get_shape().as_list()
    print_error("{}".format([width, height, channel]))

    with tf.variable_scope('_discriminator', reuse=reuse):
        # y = tf.reshape(input_z, [-1, 1, 1, 740], name="y_reshape")  # 2*2*185 => 740
        # input_z = tf.layers.batch_normalization(input_z)
        # input_z = tf.layers.dense(input_z, width*height*channel)
        # y = tf.reshape(input_z, [-1, width, height, channel], name="y_reshape")
        # print_error(y)
        # x1 = conv_cond_concat(y, images)
        '''
        # c_code = tf.expand_dims(tf.expand_dims(input_z, 1), 1)
        # c_code = tf.tile(c_code, [1, 1, 1, 740])
        # x1 = tf.concat([images, c_code], 3)
        # print_error(x1)
        '''
        # x1 = tf.layers.batch_normalization(images)

        # Input layer consider ?x32x32x3
        x1 = tf.layers.conv2d(
            images, 64, 5, strides=2, padding='same',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
        x1 = tf.maximum(self.alpha * x1, x1)
        x1 = tf.layers.batch_normalization(x1, training=True)
        # relu1 = tf.layers.dropout(relu1, rate=0.5)
        # 16x16x64
        # print(x1)

        x2 = tf.layers.conv2d(
            x1, 128, 5, strides=2, padding='same',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
        x2 = tf.maximum(self.alpha * x2, x2)
        x2 = tf.layers.batch_normalization(x2, training=True)
        # relu2 = tf.layers.dropout(relu2, rate=0.5)
        # 8x8x128
        # print(x2)

        x3 = tf.layers.conv2d(
            x2, 256, 5, strides=2, padding='same',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
        x3 = tf.maximum(self.alpha * x3, x3)
        x3 = tf.layers.batch_normalization(x3, training=True)
        # relu3 = tf.layers.dropout(relu3, rate=0.5)
        # 4x4x256
        # print(x3)

        # Flatten it
        flat = tf.reshape(x3, (-1, 4 * 4 * 256))
        flat = tf.concat([flat, input_z], -1)
        # conditioned_fully_connected_layer = tf.concat([flat], axis=-1)
        #
        # flat = tf.layers.dense(flat, 512)
        # flat = tf.layers.dense(flat, 1024)
        # flat = tf.layers.dense(flat, 512)
        # flat = tf.layers.dense(flat, 256)
        # flat = tf.layers.dense(flat, 128)

        logits = tf.layers.dense(flat, 512)
        logits = tf.maximum(self.alpha * logits, logits)
        logits = tf.layers.dense(logits, 1)
        # print(logits)
        out = tf.sigmoid(logits)
        # print('_discriminator out: ', out)

        print_info("======>out: {}".format(out))

        return out, logits
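# The repeated `tf.maximum(self.alpha * x, x)` pattern above is a hand-written leaky
# ReLU; a numpy illustration of the same non-linearity (the alpha value here is an
# assumption, commonly around 0.2 in DCGAN-style discriminators).
import numpy as np


def leaky_relu(x, alpha=0.2):
    return np.maximum(alpha * x, x)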
def predict_on_instance(self, executor, file_path):
    """
    Given a mixed audio file, it generates three audio files: the reconstructed
    mixture, source 1 and source 2.

    #TODO debug!!!

    :param executor: executor that wraps the trained estimator
    :param file_path: path to the mixed wav file
    :return:
    """
    estimator = executor.estimator

    in_data_features, voice_activity_detection_data_features, phase_features = \
        self._get_predict_samples(file_path=file_path)
    print_info("in_data_features original shape: {}".format(in_data_features.shape))
    voice_activity_detection_data_features = np.asarray(voice_activity_detection_data_features)

    num_samples_N = in_data_features.shape[0]
    hop_size = self._hparams.frame_size // 4  # 256 / 4 = 64

    # (38 * 100 - 1) * 64 + 256 = 243392 for a wav file of 10 seconds appended three
    # times, i.e. 10 * 8000 (sampling rate) * 3 = 240000
    out_audio1 = np.zeros([
        (num_samples_N * self._hparams.frames_per_sample - 1) * hop_size + self._hparams.frame_size
    ])
    out_audio2 = np.zeros([
        (num_samples_N * self._hparams.frames_per_sample - 1) * hop_size + self._hparams.frame_size
    ])
    mix = np.zeros([
        (num_samples_N * self._hparams.frames_per_sample - 1) * hop_size + self._hparams.frame_size
    ])

    def get_dataset():
        dataset = tf.data.Dataset.from_tensor_slices(({
            self.FEATURE_1_NAME: in_data_features,
            self.FEATURE_2_NAME: voice_activity_detection_data_features
        }, np.ones_like(in_data_features)))
        dataset = dataset.batch(batch_size=1)
        print_info(dataset.output_shapes)
        return dataset

    predict_fn = estimator.predict(input_fn=lambda: get_dataset())

    print_info("Shape of in data: {}".format(in_data_features.shape))
    print_info("Number of samples for given file: {}".format(num_samples_N))

    embeddings = []
    i = 0
    for predicted_value in predict_fn:
        # print("i = {}".format(i))
        """
        TODO: strange behaviour!
        1 wav file = N samples. Eg: N = 600
        FramesPerSample=100, BatchSize = 1, NEFF = 129, EMD_K = 30
        For each sample the embeddings are of shape [batch_size * frames_per_sample, NEFF, embd_dim].
        For prediction the batch size is made 1, hence the embeddings collapse to
        [frames_per_sample, NEFF, embd_dim].
        1 sample prediction will have `frames_per_sample` outputs.
        Eg: If the input audio file has 75 frames, the prediction will have [7500, NEFF, embd_dim]
        """
        embeddings.append(predicted_value)
        i += 1

    # Number of embeddings predicted for given file: 3800 with shape (3800, 129, 40)
    # [number of samples * frames_per_sample, NEFF, EMBD_DIM]
    print_info(
        "Number of embeddings predicted for given file: {} with shape {}".format(
            len(embeddings), np.asarray(embeddings).shape))

    N_assign = 0

    # for every chunk of frames of data
    for sample_i in tqdm(range(num_samples_N)):  # num_samples = 38
        # expand the dimension to be in line with the TF batch size
        in_data_np = np.expand_dims(in_data_features[sample_i], axis=0)
        in_phase_np = np.expand_dims(phase_features[sample_i], axis=0)
        voice_activity_detection_data_np = np.expand_dims(
            voice_activity_detection_data_features[sample_i], axis=0)

        print_info("in_data_np : ")
        print_error(in_data_np.shape)

        """
        0*100 to (0+1)*100 = 0 to 100
        1*100 to (1+1)*100 = 100 to 200
        2*100 to (2+1)*100 = 200 to 300
        """
        embedding_np = np.asarray(
            embeddings[sample_i * self._hparams.frames_per_sample:
                       (sample_i + 1) * self._hparams.frames_per_sample])

        # ----------------------------------------------

        # embedding_ac = []
        # for i, j in itertools.product(range(self._hparams.frames_per_sample), range(self._hparams.neff)):
        #     if voice_activity_detection_data_np[0, i, j] == 1:
        #         embedding_ac.append(embedding_np[i, j, :])

        # collect the embeddings of the active (voiced) time-frequency bins
        embedding_ac = [
            embedding_np[i, j, :] for i, j in itertools.product(
                range(self._hparams.frames_per_sample),
                range(self._hparams.neff))
            if voice_activity_detection_data_np[0, i, j] == 1
        ]

        print_error(np.array(embedding_ac).shape)

        if embedding_ac == []:
            break

        kmean = KMeans(n_clusters=2, random_state=0).fit(embedding_ac)

        # ----------------------------------------------
        mask = np.zeros([self._hparams.frames_per_sample, self._hparams.neff, 2])
        ind = 0
        # print_info("N_assign : {}".format(N_assign))

        center = kmean.cluster_centers_
        center = center * 0.7 + 0.3 * kmean.cluster_centers_

        # two speakers have appeared
        # print_info("Found 2 speakers ...")
        center_new = kmean.cluster_centers_
        cor = np.matmul(center_new[0, :], np.transpose(center))

        # rearrange their sequence if not consistent with previous frames
        # print_info("Correlation : {}".format(cor))
        if (cor[1] > cor[0]):
            kmean.cluster_centers_ = np.array(
                [kmean.cluster_centers_[1], kmean.cluster_centers_[0]])
            kmean.labels_ = (kmean.labels_ == 0).astype('int')

        kmean.labels_ = (~kmean.labels_).astype('int')
        # print_info("center : {}".format(center))
        # print_info("kmean.labels_ : {}".format(kmean.labels_))

        # ----------------------------------------------
        # print_error(kmean.labels_)

        # transform the clustering result and voice_activity_detection info. into masks
        for i in range(self._hparams.frames_per_sample):
            for j in range(self._hparams.neff):
                if voice_activity_detection_data_np[0, i, j] == 1:
                    mask[i, j, kmean.labels_[ind]] = 1
                    ind += 1

        for i in range(self._hparams.frames_per_sample):
            # apply the mask and reconstruct the waveform
            tot_ind = sample_i * self._hparams.frames_per_sample + i
            # ipdb.set_trace()
            amp = in_data_np[0, i, :] * self._hparams.global_std + self._hparams.global_mean

            out_data1 = (mask[i, :, 0] * amp * voice_activity_detection_data_np[0, i, :])
            out_data2 = (mask[i, :, 1] * amp * voice_activity_detection_data_np[0, i, :])
            out_mix = amp

            out_data1_l = 10 ** (out_data1 / 20) / self._hparams.amp_fac
            out_data2_l = 10 ** (out_data2 / 20) / self._hparams.amp_fac
            out_mix_l = 10 ** (out_mix / 20) / self._hparams.amp_fac

            out_stft1 = out_data1_l * in_phase_np[0, i, :]
            out_stft2 = out_data2_l * in_phase_np[0, i, :]
            out_stft_mix = out_mix_l * in_phase_np[0, i, :]

            con_data1 = out_stft1[-2:0:-1].conjugate()
            con_data2 = out_stft2[-2:0:-1].conjugate()
            con_mix = out_stft_mix[-2:0:-1].conjugate()

            out1 = np.concatenate((out_stft1, con_data1))
            out2 = np.concatenate((out_stft2, con_data2))
            out_mix = np.concatenate((out_stft_mix, con_mix))

            frame_out1 = np.fft.ifft(out1).astype(np.float64)
            frame_out2 = np.fft.ifft(out2).astype(np.float64)
            frame_mix = np.fft.ifft(out_mix).astype(np.float64)

            start = tot_ind * hop_size

            out_audio1[start:(start + len(frame_out1))] += frame_out1 * 0.5016
            out_audio2[start:(start + len(frame_out2))] += frame_out2 * 0.5016
            mix[start:(start + len(frame_mix))] += frame_mix * 0.5016

    ## the audio has been padded 3 times in AudioReader
    ## restore the original audio
    len1 = len(out_audio1) // 3
    len2 = len(out_audio2) // 3
    source1 = out_audio1[len1:2 * len1]
    source2 = out_audio2[len2:2 * len2]
    mix = mix[len2:2 * len2]

    # Length of source_1 81130 source_1 shape (81130,)
    # Length of source_2 81130 source_2 shape (81130,)
    print_info("Length of source_1 {} source_1 shape {}".format(len1, source1.shape))
    print_info("Length of source_2 {} source_2 shape {}".format(len2, source2.shape))

    print_info("Writing file {}".format(os.path.splitext(file_path)[0] + "_source1.wav"))
    print_info("Writing file {}".format(os.path.splitext(file_path)[0] + "_source2.wav"))

    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + "_source1.wav", source1, self._hparams.sampling_rate)
    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + "_source2.wav", source2, self._hparams.sampling_rate)
    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + "_full.wav", mix, self._hparams.sampling_rate)

    return [(source1, self._hparams.sampling_rate), (source2, self._hparams.sampling_rate)]
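# Minimal overlap-add sketch mirroring the reconstruction loop above: each
# inverse-FFT frame is written at its hop offset and summed into the output buffer.
# The frame size, hop size and the 0.5016 window-compensation factor come from the
# code above; treating them as fixed defaults here is an assumption.
import numpy as np


def overlap_add(frames, frame_size=256, hop_size=64, scale=0.5016):
    out = np.zeros((len(frames) - 1) * hop_size + frame_size)
    for idx, frame in enumerate(frames):
        start = idx * hop_size
        out[start:start + frame_size] += frame[:frame_size] * scale
    return out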