def test_generate_labels_from_classifications(self):
    """Check that runs of equal per-frame classes are merged into labels.

    The fixture has 24 frames; the expected output contains one
    AudacityLabel per run of identical consecutive classifications.
    """
    frame_classes = [
        1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1,
        0,
        1, 1,
        0,
        1, 1, 1, 1, 1,
    ]
    frame_starts = [
        0.0,
        0.6965986394557823,
        1.3931972789115645,
        2.089795918367347,
        2.786394557823129,
        3.4829931972789114,
        4.179591836734694,
        4.876190476190477,
        5.572789115646258,
        6.2693877551020405,
        6.965986394557823,
        7.662585034013605,
        8.359183673469389,
        9.05578231292517,
        9.752380952380953,
        10.448979591836734,
        11.145578231292516,
        11.842176870748299,
        12.538775510204081,
        13.235374149659863,
        13.931972789115646,
        14.628571428571428,
        15.32517006802721,
        16.021768707482995,
    ]

    # NOTE(review): the end of the third label (…836736) differs from the
    # start of the fourth (…836734) in the last digit. This is reproduced
    # exactly; presumably it reflects float rounding in how the end time is
    # computed -- confirm against Util.
    expected_labels = [
        AudacityLabel(0.0, 1.3931972789115645, 1),
        AudacityLabel(1.3931972789115645, 9.752380952380953, 0),
        AudacityLabel(9.752380952380953, 10.448979591836736, 1),
        AudacityLabel(10.448979591836734, 11.145578231292516, 0),
        AudacityLabel(11.145578231292516, 12.538775510204081, 1),
        AudacityLabel(12.538775510204081, 13.235374149659863, 0),
        AudacityLabel(13.235374149659863, 16.718367346938777, 1),
    ]

    labels = Util.generate_labels_from_classifications(frame_classes, frame_starts)

    self.assertListEqual(expected_labels, labels)
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1,
                     neighbours_before_after=6, times_greater=2):
    """Fuse audio and image labels and write the result as Audacity labels.

    Audio/image pairs are built with ``create_pairs``, a face-to-voice mapping
    is detected and applied, and then every pair whose single detected face
    disagrees with its (speech) audio class has its audio class replaced by
    the result of ``find_nearest_neighbours_class``. The corrected audio
    classes are turned into labels and written to
    ``<lbls_dir>/<youtube_video_id>.fusion.txt``.

    :param youtube_video_id: identifier used as the output file's base name.
    :param lbls_dir: directory the ``.fusion.txt`` file is written to.
    :param audio_lbls: audio labels, forwarded to ``create_pairs``.
    :param image_lbls: image labels, forwarded to ``create_pairs``.
    :param duration: forwarded to ``create_pairs``.
    :param step: forwarded to ``create_pairs`` (default 0.1, i.e. 100ms).
    :param neighbours_before_after: forwarded to ``find_nearest_neighbours_class``.
    :param times_greater: forwarded to ``find_nearest_neighbours_class``.
    :return: the mapping returned by ``detect_face_voice_mapping``.
    """
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)

    # Running total of disagreeing time; it is tallied here but not reported
    # anywhere in this function.
    seconds_of_mismatch = 0

    # The loop must stay sequential: the neighbour lookup reads ``pairs``,
    # which already contains the corrections made for earlier indices.
    for index, pair in enumerate(pairs):
        face_class = pair.image_class
        if face_class is None:
            # no image information for this pair
            continue

        # if only one face has been detected then assume it's the face of the speaker
        single_face = len(face_class.split(",")) == 1
        if not single_face or pair.audio_class == 'non_speech':
            continue

        if face_class == pair.audio_class:
            continue

        seconds_of_mismatch += step
        pair.audio_class = find_nearest_neighbours_class(
            index,
            pairs,
            neighbours_before_after=neighbours_before_after,
            times_greater=times_greater,
        )

    audio_classes = [p.audio_class for p in pairs]
    all_labels = Util.generate_labels_from_classifications(audio_classes, timestamps)
    lbls = [lbl for lbl in all_labels if lbl.label is not None]

    output_path = os.path.join(lbls_dir, youtube_video_id + ".fusion.txt")
    Util.write_audacity_labels(lbls, output_path)

    return mapping_face_to_voice
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1):
    """Fuse audio and image labels and write the result as JSON and Audacity labels.

    Audio/image pairs are built with ``create_pairs``, a face-to-voice mapping
    is detected and applied, and then every pair whose single detected face
    disagrees with its (speech) audio class has its audio class replaced by
    the result of ``find_nearest_neighbours_class``. The corrected audio
    classes are turned into labels, labels whose ``label`` is ``None`` are
    dropped, and the remainder is written to
    ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    :param youtube_video_id: identifier used as the output files' base name.
    :param lbls_dir: directory the output files are written to.
    :param audio_lbls: audio labels, forwarded to ``create_pairs``.
    :param image_lbls: image labels, forwarded to ``create_pairs``.
    :param duration: forwarded to ``create_pairs``.
    :param step: forwarded to ``create_pairs`` (default 0.1, i.e. 100ms).
    :return: the mapping returned by ``detect_face_voice_mapping``.
    """
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    print(pairs)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:
            # when image is None
            continue

        classes = pair.image_class.split(",")
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                # The lookup reads ``pairs``, including corrections already
                # applied to earlier indices, so this loop must stay sequential.
                pair.audio_class = find_nearest_neighbours_class(k, pairs)
    print(pairs)

    all_labels = Util.generate_labels_from_classifications(
        [p.audio_class for p in pairs], timestamps)

    # Materialise the filtered labels as a list: they are consumed twice (JSON
    # dump and Audacity writer). A bare ``filter`` object is a one-shot
    # iterator on Python 3 and would reach the second consumer empty.
    lbls = [lbl for lbl in all_labels if lbl.label is not None]

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))
    return mapping_face_to_voice
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout, weights_filename,
                                      scaler_filename, window_size, step, hop_size, youtube_video_id, lbls_dir,
                                      clusters=4, sr=16000):
    """Cluster the speech frames of an audio file on raw MFCCs and write labels.

    Voice segments come from ``Vad.detect_voice_segments``. MFCC frames are
    computed for the whole file; each frame is tagged with the label of the
    first VAD segment that contains it (or ``'non_speech'``). Frames tagged
    ``'speech'`` are projected to 2-D with t-SNE and clustered with a Gaussian
    mixture; the cluster ids replace the ``'speech'`` tags. The resulting
    labels are written to ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    :param audio_file: path of the audio file to process.
    :param h: number of MFCC coefficients (``n_mfcc``).
    :param window_size: FFT size (``n_fft``) for the MFCC computation.
    :param hop_size: hop length, in samples, between MFCC frames.
    :param youtube_video_id: base name of the output files.
    :param lbls_dir: directory the output files are written to.
    :param clusters: number of Gaussian mixture components.
    :param sr: sample rate requested from ``librosa.load``.

    ``w``, ``embedding_size``, ``lstm_nodes``, ``dropout``,
    ``weights_filename``, ``scaler_filename`` and ``step`` are accepted but
    not used by this implementation.
    """
    vad_lbls = Vad().detect_voice_segments(audio_file)

    signal, sr = librosa.load(audio_file, sr=sr)
    X = mfcc(signal, sr=sr, n_mfcc=h, n_fft=window_size, hop_length=hop_size)

    # One timestamp (in seconds) per MFCC frame.
    timestamps = numpy.array([k * hop_size / sr for k in range(0, X.shape[1])])
    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for timestamp in timestamps:
        for lbl in vad_lbls:
            # need the window end to fit in the label
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:
                frame_predictions.append(lbl.label)
                break
        else:
            frame_predictions.append('non_speech')
    frame_predictions = numpy.array(frame_predictions)

    speech_indices = numpy.where(frame_predictions == 'speech')

    # Indexing with the tuple returned by numpy.where adds a singleton axis,
    # which the reshape removes again before the frames are made row-major.
    X_speech = X[:, speech_indices]
    X_speech = X_speech.reshape((X_speech.shape[0], X_speech.shape[2]))
    print(X_speech.shape)
    X_speech = X_speech.transpose()
    print(X_speech.shape)

    # Only consumed by an earlier, now disabled, visualisation path
    # (t-SNE/PCA scatter plots against annotated data).
    timestamps_speech = timestamps[speech_indices]

    clustering_algorithm = GaussianMixture(n_components=clusters, max_iter=1000, n_init=3)

    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(X_speech)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    # Overwrite every 'speech' tag with the cluster id of that frame.
    for speech_index, cluster_id in zip(speech_indices[0], predictions):
        frame_predictions[speech_index] = cluster_id

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     generate_audio_based_segmentation(
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/audios/Unamij6z1io.wav",
#         15, 20, 256, 128, 0.2,
#         os.path.abspath('models/weights.h5'),
#         os.path.abspath('models/scaler.pickle'),
#         1024, 3, 1024, "xxx",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/mfcc",
#         clusters=4
#     )
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir, faces, predictor_path,
                                     face_rec_model_path):
    """Cluster the faces found in sampled video frames and write labels.

    Every ``FRAMES_PER_STEP``-th ``*.jpg`` in ``images_dir`` (sorted by name)
    is run through dlib's frontal face detector; each detected face is turned
    into a face descriptor. The descriptors are clustered with KMeans and, per
    frame timestamp, the sorted cluster ids are joined with ``","`` to form
    that frame's class. The resulting labels are written to
    ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    When no face is detected at all, only an empty ``.txt`` label file is
    written and the function returns early.

    :param youtube_video_id: base name of the output files.
    :param images_dir: directory containing the extracted ``*.jpg`` frames.
    :param lbls_dir: directory the output files are written to.
    :param faces: requested number of face clusters; capped at the number of
        detected faces, because KMeans rejects ``n_clusters > n_samples``.
    :param predictor_path: path to the dlib shape predictor model.
    :param face_rec_model_path: path to the dlib face recognition model.
    :return: None
    """
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()

    images = [
        images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)
    ]
    print(images)

    # Frame timestamps in seconds; the source frames are taken to be 25 fps.
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []

    for frame_no, f in enumerate(images):
        print("Processing file: {}".format(f))
        img = io.imread(f)

        dets = detector(img, 1)
        print("Number of faces detected: {}".format(len(dets)))

        for d in dets:
            shape = sp(img, d)
            embeddings.append(facerec.compute_face_descriptor(img, shape))
            # One entry per face, so a frame with several faces repeats its timestamp.
            embeddings_timestamps.append(timestamps[frame_no])

    embeddings = numpy.array(embeddings)
    embeddings_timestamps = numpy.array(embeddings_timestamps)
    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([], os.path.join(lbls_dir, youtube_video_id + ".txt"))
        return

    # KMeans raises when asked for more clusters than there are samples.
    kmeans = KMeans(n_clusters=min(faces, len(embeddings)))
    kmeans.fit(embeddings)
    predictions = numpy.array(kmeans.labels_.tolist())

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []
    # Group by the column *name*, not a one-element list: with a list, newer
    # pandas versions yield 1-tuples as keys instead of the scalar timestamp.
    for key, group in df.groupby('timestamps'):
        timestamps.append(key)
        classes.append(",".join(
            [str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#
#     extract_images_from_video("/Users/nicktgr15/workspace/speaker_diarisation_poc/src/videos/Unamij6z1io.mp4",
#                               "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames")
#
#     generate_face_based_segmentation(
#         "Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames/Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/image",
#         4,
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/shape_predictor_68_face_landmarks.dat",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/dlib_face_recognition_resnet_model_v1.dat"
#     )
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout, weights_filename,
                                      scaler_filename, window_size, step, hop_size, youtube_video_id, lbls_dir,
                                      clusters=4, sr=16000):
    """Cluster the speech windows of an audio file on learned embeddings.

    Voice segments come from ``Vad.detect_voice_segments``. Feature windows
    are extracted with ``FeatExtractorMFCC``; each window is tagged with the
    label of the first VAD segment that contains it (or ``'non_speech'``).
    Windows tagged ``'speech'`` are scaled with the pickled scaler, embedded
    with the ``intermediate`` model returned by ``get_lstm_siamese`` and
    clustered with KMeans; the cluster ids replace the ``'speech'`` tags. The
    resulting labels are written to
    ``<lbls_dir>/<youtube_video_id>.audio.txt``.

    :param audio_file: path of the audio file to process.
    :param w: first dimension of one feature window (see the reshapes below).
    :param h: second dimension of one feature window.
    :param embedding_size: forwarded to ``get_lstm_siamese``.
    :param lstm_nodes: forwarded to ``get_lstm_siamese``.
    :param dropout: forwarded to ``get_lstm_siamese``.
    :param weights_filename: weights file loaded into the siamese model.
    :param scaler_filename: pickle file holding the feature scaler.
    :param window_size: forwarded to ``FeatExtractorMFCC``.
    :param step: forwarded to ``FeatExtractorMFCC``.
    :param hop_size: forwarded to ``FeatExtractorMFCC``.
    :param youtube_video_id: base name of the output file.
    :param lbls_dir: directory the output file is written to.
    :param clusters: number of KMeans clusters.
    :param sr: forwarded to ``FeatExtractorMFCC``.
    """
    vad_lbls = Vad().detect_voice_segments(audio_file)

    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # scaler file must come from a trusted source.
    with open(scaler_filename, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes, dropout)
    model.load_weights(weights_filename)

    extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    # Tag each window with the first VAD label it fits into entirely
    # (the window end has to fit in the label as well).
    frame_predictions = numpy.array([
        next(
            (lbl.label for lbl in vad_lbls
             if lbl.start_seconds <= timestamp <= lbl.end_seconds - window),
            'non_speech',
        )
        for timestamp in timestamps
    ])
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[speech_indices]

    # Flatten to 2-D for the scaler, then restore the (windows, w, h) layout.
    flat = X_speech.reshape((X_speech.shape[0] * w, h))
    flat = scaler.transform(flat)
    X = flat.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    predictions = KMeans(n_clusters=clusters).fit_predict(original_embeddings)

    # Overwrite every 'speech' tag with the cluster id of that window.
    for speech_index, cluster_id in zip(speech_indices[0], predictions):
        frame_predictions[speech_index] = cluster_id

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)
    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".audio.txt"))
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir, faces, predictor_path,
                                     face_rec_model_path, tmp_dir):
    """Cluster the faces found in sampled video frames and write labels.

    Every ``FRAMES_PER_STEP``-th ``*.jpg`` in ``images_dir`` (sorted by name)
    is run through dlib's frontal face detector; each detected face is turned
    into a face descriptor. Descriptors and their timestamps are cached in
    ``tmp_dir`` as ``embeddings.npy`` / ``embeddings_timestamps.npy`` and are
    loaded from there when both files already exist. The descriptors are
    clustered with KMeans and, per frame timestamp, the sorted cluster ids are
    joined with ``","`` to form that frame's class. The resulting labels are
    written to ``<lbls_dir>/<youtube_video_id>.image.txt``.

    When there are no descriptors, an empty label file is written and the
    function returns early.

    :param youtube_video_id: base name of the output file.
    :param images_dir: directory containing the extracted ``*.jpg`` frames.
    :param lbls_dir: directory the output file is written to.
    :param faces: requested number of face clusters; capped at the number of
        detected faces, because KMeans rejects ``n_clusters > n_samples``.
    :param predictor_path: path to the dlib shape predictor model.
    :param face_rec_model_path: path to the dlib face recognition model.
    :param tmp_dir: directory holding the cached descriptor arrays.
    :return: None
    """
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()

    images = [
        images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)
    ]
    print(images)

    # Frame timestamps in seconds; the source frames are taken to be 25 fps.
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = face_recognition_model_v1(face_rec_model_path)

    # NOTE(review): the cache file names do not include the video id, so
    # tmp_dir presumably has to be unique per video -- confirm with callers.
    embeddings_pickle = os.path.join(tmp_dir, "embeddings.npy")
    embeddings_timestamps_pickle = os.path.join(tmp_dir, "embeddings_timestamps.npy")

    if not os.path.isfile(embeddings_pickle) or not os.path.isfile(
            embeddings_timestamps_pickle):
        embeddings = []
        embeddings_timestamps = []

        for frame_no, f in enumerate(images):
            print("Processing file: {}".format(f))
            img = io.imread(f)

            dets = detector(img, 1)
            print("Number of faces detected: {}".format(len(dets)))

            for d in dets:
                shape = sp(img, d)
                embeddings.append(facerec.compute_face_descriptor(img, shape))
                # One entry per face, so a frame with several faces repeats its timestamp.
                embeddings_timestamps.append(timestamps[frame_no])

        embeddings = numpy.array(embeddings)
        embeddings_timestamps = numpy.array(embeddings_timestamps)

        numpy.save(embeddings_pickle, embeddings)
        numpy.save(embeddings_timestamps_pickle, embeddings_timestamps)
    else:
        embeddings = numpy.load(embeddings_pickle)
        embeddings_timestamps = numpy.load(embeddings_timestamps_pickle)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([], os.path.join(
            lbls_dir, youtube_video_id + ".image.txt"))
        return

    # KMeans raises when asked for more clusters than there are samples.
    kmeans = KMeans(n_clusters=min(faces, len(embeddings)))
    kmeans.fit(embeddings)
    predictions = numpy.array(kmeans.labels_.tolist())

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []
    # Group by the column *name*, not a one-element list: with a list, newer
    # pandas versions yield 1-tuples as keys instead of the scalar timestamp.
    for key, group in df.groupby('timestamps'):
        timestamps.append(key)
        classes.append(",".join(
            [str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout, weights_filename,
                                      scaler_filename, window_size, step, hop_size, youtube_video_id, lbls_dir,
                                      clusters=4, sr=16000):
    """Cluster the speech windows of an audio file on t-SNE-reduced embeddings.

    Voice segments come from ``Vad.detect_voice_segments``. Feature windows
    are extracted with ``FeatExtractorMFCC``; each window is tagged with the
    label of the first VAD segment that contains it (or ``'non_speech'``).
    Windows tagged ``'speech'`` are scaled with the pickled scaler, embedded
    with the ``intermediate`` model returned by ``get_lstm_siamese``,
    projected to 2-D with t-SNE and clustered with a Gaussian mixture; the
    cluster ids replace the ``'speech'`` tags. The resulting labels are
    written to ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    :param audio_file: path of the audio file to process.
    :param w: first dimension of one feature window (see the reshapes below).
    :param h: second dimension of one feature window.
    :param embedding_size: forwarded to ``get_lstm_siamese``.
    :param lstm_nodes: forwarded to ``get_lstm_siamese``.
    :param dropout: forwarded to ``get_lstm_siamese``.
    :param weights_filename: weights file loaded into the siamese model.
    :param scaler_filename: pickle file holding the feature scaler.
    :param window_size: forwarded to ``FeatExtractorMFCC``.
    :param step: forwarded to ``FeatExtractorMFCC``.
    :param hop_size: forwarded to ``FeatExtractorMFCC``.
    :param youtube_video_id: base name of the output files.
    :param lbls_dir: directory the output files are written to.
    :param clusters: number of Gaussian mixture components.
    :param sr: forwarded to ``FeatExtractorMFCC``.
    """
    vad_lbls = Vad().detect_voice_segments(audio_file)

    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # scaler file must come from a trusted source.
    with open(scaler_filename, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes, dropout)
    model.load_weights(weights_filename)

    extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for timestamp in timestamps:
        for lbl in vad_lbls:
            # need the window end to fit in the label
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:
                frame_predictions.append(lbl.label)
                break
        else:
            frame_predictions.append('non_speech')
    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[speech_indices]

    # Only consumed by an earlier, now disabled, visualisation path
    # (t-SNE/PCA scatter plots against annotated data).
    timestamps_speech = timestamps[speech_indices]

    # Flatten to 2-D for the scaler, then restore the (windows, w, h) layout.
    flat = X_speech.reshape((X_speech.shape[0] * w, h))
    flat = scaler.transform(flat)
    X = flat.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters, max_iter=1000, n_init=3)

    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    # Overwrite every 'speech' tag with the cluster id of that window.
    for speech_index, cluster_id in zip(speech_indices[0], predictions):
        frame_predictions[speech_index] = cluster_id

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))