import argparse
import glob
import os
import pickle
import subprocess

# Project-local dependencies (FEATURE_PLAN, feat, kernel, Util) are assumed to
# be importable from the surrounding package.


def main():
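    """Segment a single audio file into speech/music regions.

    Sketch of the pipeline below: resample the input to 22.05 kHz mono PCM
    with ffmpeg, extract features with yaafe, locate segment boundaries from
    combined feature peaks, classify frames with a pre-trained pickled model,
    and write the merged labels as an Audacity label track next to the input.

    Example invocation (script name hypothetical):
        python discriminate.py --input-file /path/to/audio.wav
    """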
    parser = argparse.ArgumentParser(
        description='Segment an audio file into speech and music regions.')
    parser.add_argument('--input-file', dest='input_file', required=True)
    args = parser.parse_args()

    input_dir = os.path.dirname(args.input_file)
    temp_file = os.path.join(input_dir, "temp.wav")

    cmd = [
        "/usr/bin/ffmpeg", "-i", args.input_file, "-ar", "22050", "-ac", "1",
        "-acodec", "pcm_s16le", temp_file, "-y"
    ]
    subprocess.check_call(cmd)

    cmd = ["yaafe", "-c", FEATURE_PLAN, "-r", "22050", temp_file]

    subprocess.check_output(cmd)

    features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
    features2 = ["mfcc_stats"]
    features3 = ["spectral_flatness_per_band"]
    features4 = features1 + features2 + features3

    FEATURE_GROUPS = [features1, features2, features3, features4]

    peaks, convolution_values, timestamps = feat.get_combined_peaks(
        temp_file, FEATURE_GROUPS, kernel_type="gaussian")
    detected_segments = kernel.calculate_segment_start_end_times_from_peak_positions(
        peaks, timestamps)

    timestamps, feature_vectors = feat.read_features(features4,
                                                     temp_file,
                                                     scale=True)

    with open("/opt/speech-music-discrimination/pickled/model.pickle",
              'r') as f:
        trained_model = pickle.load(f)

    frame_level_predictions = trained_model.predict(feature_vectors)

    annotated_segments = Util.get_annotated_labels_from_predictions_and_sm_segments(
        frame_level_predictions, detected_segments, timestamps)

    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(
        annotated_segments)
    annotated_segments = feat.filter_noisy_labels(annotated_segments)
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(
        annotated_segments)

    Util.write_audacity_labels(annotated_segments,
                               os.path.join(input_dir, "annotated-segments.txt"))

    for f in glob.glob(input_dir + "/*.csv"):
        os.remove(f)

    os.remove(temp_file)
def main():
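    """Build a combined speech/music evaluation file from the MIREX 2015
    muspeak detection examples.

    Sketch of the steps below: unzip the dataset, convert the mp3s to wav,
    cut each file into speech/music segments from its CSV annotations,
    concatenate the segments in a shuffled (seeded) order with sox, and write
    the matching ground-truth labels to mirex_combined.txt.
    """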

    subprocess.check_output(["unzip", "muspeak-mirex2015-detection-examples.zip", "-d",
                             "muspeak-mirex2015-detection-examples"])

    mp3_to_wav()

    wav_files = glob.glob("./muspeak-mirex2015-detection-examples/*.wav")

    for wav_file in wav_files:
        print(wav_file)
        label_file = wav_file.replace(".mp3.wav", ".csv")
        if not os.path.isfile(label_file):
            label_file = label_file.replace(".csv", "_v2.csv")
        WavEditor.create_audio_segments(label_file, wav_file, "segments", True, ",", "f2", remove_overlapping=True)

    speech_wavs = glob.glob("./segments/*_s.wav")
    music_wavs = glob.glob("./segments/*_m.wav")

    all_files_dict = {}

    for f in speech_wavs:
        all_files_dict[f] = "s"

    for f in music_wavs:
        all_files_dict[f] = "m"

    random.seed(2222)
    all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

    last_seconds = 0
    files_to_concatenate = []

    labels = []
    for v in all_files_random_keys:
        duration = float(subprocess.check_output(["soxi", "-D", v]).decode().strip())
        segment_start_time = last_seconds
        segment_end_time = last_seconds + duration
        last_seconds += duration
        labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
        files_to_concatenate.append(v)

    audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
    Util.write_audacity_labels(audacity_labels, "mirex_combined.txt")

    command = []
    command.append("sox")
    command.extend(files_to_concatenate)
    command.append("mirex_combined.wav")
    subprocess.check_output(command)

    shutil.rmtree("./segments")
    shutil.rmtree("./muspeak-mirex2015-detection-examples")
def calculate_fusion(youtube_video_id,
                     lbls_dir,
                     audio_lbls,
                     image_lbls,
                     duration,
                     step=0.1,  # 100ms
                     neighbours_before_after=6,
                     times_greater=2):
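    """Fuse audio (voice) and image (face) label tracks for a video.

    For every time step the audio and image classes are paired; when exactly
    one face is visible and it disagrees with the audio label, the audio
    label is replaced by the dominant class among the neighbouring pairs.
    The fused labels are written to <youtube_video_id>.fusion.txt in
    lbls_dir and the face-to-voice mapping is returned.
    """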

    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    # print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    # print(pairs)
    seconds_of_mismatch = 0
    for k, pair in enumerate(pairs):
        if pair.image_class is None:
            # when image is None
            continue
        classes = pair.image_class.split(",")
        # if only one face has been detected then assume it's the face of the speaker
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                seconds_of_mismatch += step
                # print("%s != %s" % (pair.image_class, pair.audio_class))
                nearest_neighbour_class = find_nearest_neighbours_class(
                    k,
                    pairs,
                    neighbours_before_after=neighbours_before_after,
                    times_greater=times_greater)
                pair.audio_class = nearest_neighbour_class

    lbls = Util.generate_labels_from_classifications(
        [p.audio_class for p in pairs], timestamps)
    lbls = list(filter(lambda x: x.label is not None, lbls))

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".fusion.txt"))
    return mapping_face_to_voice
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1): # 100ms
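    """Variant of calculate_fusion above: same pairing and neighbour-based
    correction, but also dumps the fused labels as JSON next to the
    Audacity-style .txt output."""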

    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    print(pairs)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:
            # when image is None
            continue
        classes = pair.image_class.split(",")
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                # print("%s != %s" % (pair.image_class, pair.audio_class))
                nearest_neighbour_class = find_nearest_neighbours_class(k, pairs)
                pair.audio_class = nearest_neighbour_class

    print(pairs)

    lbls = Util.generate_labels_from_classifications([p.audio_class for p in pairs], timestamps)
    lbls = list(filter(lambda x: x.label is not None, lbls))

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))
    return mapping_face_to_voice
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
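    """Speaker segmentation from audio only (no trained embedding model).

    Sketch: run VAD, compute MFCCs with librosa, mark non-speech frames,
    project the speech-frame MFCCs to 2-D with t-SNE, cluster them with a
    Gaussian mixture, and write the per-cluster labels as JSON and as an
    Audacity label track. Several signature arguments (embedding_size,
    lstm_nodes, dropout, weights_filename, scaler_filename, step) are unused
    in this variant.
    """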
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    # model.load_weights(weights_filename)
    # feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    # X, timestamps = feature_extractor.extract(audio_file)
    # timestamps = numpy.array(timestamps)

    y, sr = librosa.load(audio_file, sr=sr)
    X = mfcc(y, sr=sr, n_mfcc=h, n_fft=window_size, hop_length=hop_size)

    timestamps = [k * hop_size / sr for k in range(0, X.shape[1])]
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    frame_predictions = []

    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:  # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)

    speech_indices = numpy.where(frame_predictions == 'speech')

    X_speech = X[:, speech_indices]
    X_speech = X_speech.reshape((X_speech.shape[0], X_speech.shape[2]))
    print(X_speech.shape)
    X_speech = X_speech.transpose()
    print(X_speech.shape)
    timestamps_speech = timestamps[speech_indices]

    # Util.write_audacity_labels(Util.generate_labels_from_classifications(frame_predictions, timestamps),
    #                            "vad_preds_quant.txt")

    # X = X_speech.reshape((X_speech.shape[0] * w, h))
    # X = scaler.transform(X)
    # X = X.reshape(-1, w, h)

    # original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters,
                                           max_iter=1000,
                                           n_init=3)

    # if visualise:
    #
    #     embeddings, y, classes, new_timestamps = Util.get_annotated_data_x_y(timestamps_speech, original_embeddings,
    #                                                                          lbls_fixed)
    #     le = preprocessing.LabelEncoder()
    #     y = le.fit_transform(y)
    #
    #     tsne = TSNE()
    #     two_dimensional = tsne.fit_transform(embeddings)
    #
    #     #         pca = PCA(n_components=2)
    #     #         two_dimensional = pca.fit_transform(embeddings)
    #
    #     #         pca2 = PCA(n_components=20)
    #     #         pca_embeddings = pca2.fit_transform(embeddings)
    #
    #     clustering_algorithm.fit(two_dimensional)
    #     predictions = clustering_algorithm.predict(two_dimensional)
    #
    #     #        kmeans = KMeans(n_clusters=CLUSTERS)
    #     #        kmeans.fit(embeddings)
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=y, marker='.')
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    #
    # else:
    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(X_speech)

    #         original_embeddings = scale((original_embeddings))

    #         pca = PCA(n_components=2)
    #         two_dimensional = pca.fit_transform(original_embeddings)

    #         pca2 = PCA(n_components=3)
    #         pca_embeddings = pca2.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    #         kmeans = KMeans(n_clusters=CLUSTERS)
    #         kmeans.fit(two_dimensional)

    # plt.figure(figsize=(10,10))
    # plt.imshow(calculate_similarity_matrix(two_dimensional, metric='euclidean'), cmap='gray')

    # plt.figure(figsize=(10,6))
    # plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')

    #         plt.figure(figsize=(10,6))
    #         plt.scatter(two_dimensional[:, 0], pca_embeddings[:, 1], c=kmeans.labels_.tolist(), marker='.')

    #         predictions = kmeans.labels_.tolist()

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     generate_audio_based_segmentation(
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/audios/Unamij6z1io.wav",
#         15, 20, 256, 128, 0.2,
#         os.path.abspath('models/weights.h5'),
#         os.path.abspath('models/scaler.pickle'),
#         1024, 3, 1024, "xxx",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/mfcc",
#         clusters=4
#     )
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir,
                                     faces, predictor_path,
                                     face_rec_model_path):
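    """Face-based speaker segmentation for a directory of video frames.

    Sketch: sample every FRAMES_PER_STEP-th frame (assuming 25 fps), detect
    faces with dlib, compute face descriptors, cluster them with K-means
    into `faces` clusters, and write per-timestamp cluster labels as JSON
    and as an Audacity label track.
    """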

    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    # images_raw = images_raw[0:100]
    images = [
        images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)
    ]
    print(images)
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []
    landmarks_parts = []
    landmarks_rect = []

    for frame_no, f in enumerate(images):
        print("Processing file: {}".format(f))
        img = io.imread(f)

        dets = detector(img, 1)
        print("Number of faces detected: {}".format(len(dets)))

        for k, d in enumerate(dets):
            shape = sp(img, d)
            face_descriptor = facerec.compute_face_descriptor(img, shape)
            embeddings.append(face_descriptor)
            embeddings_timestamps.append(timestamps[frame_no])
            landmarks_parts.append(shape.parts())
            landmarks_rect.append(shape.rect)

    embeddings = numpy.array(embeddings)
    embeddings_timestamps = numpy.array(embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([],
                                   os.path.join(lbls_dir,
                                                youtube_video_id + ".txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)

    predictions = numpy.array(kmeans.labels_.tolist())
    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []

    for key, group in df.groupby('timestamps'):
        timestamps.append(key)
        classes.append(",".join(
            [str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)
    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#
#     extract_images_from_video("/Users/nicktgr15/workspace/speaker_diarisation_poc/src/videos/Unamij6z1io.mp4",
#                               "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames")
#
#     generate_face_based_segmentation(
#         "Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames/Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/image",
#         4,
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/shape_predictor_68_face_landmarks.dat",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/dlib_face_recognition_resnet_model_v1.dat"
#     )
    all_files_dict[f] = "m"

random.seed(1111)
all_files_random_keys = random.sample(all_files_dict.keys(),
                                      len(all_files_dict.keys()))

last_seconds = 0
files_to_concatenate = []

labels = []
for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(
        AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("gtzan_combined.wav")
subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
    all_files_dict[f] = "m"

random.seed(1111)
all_files_random_keys = random.sample(all_files_dict.keys(),
                                      len(all_files_dict.keys()))

last_seconds = 0
files_to_concatenate = []

labels = []
for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(
        AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "labrosa_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("labrosa_combined.wav")
subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music-speech'])
shutil.rmtree("./music-speech")
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
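    """Speaker segmentation using a pre-trained siamese LSTM embedding model.

    Sketch: run VAD, extract MFCC windows, scale them with the pickled
    scaler, embed the speech windows with the intermediate LSTM model, and
    cluster the embeddings with K-means before writing the labels to
    <youtube_video_id>.audio.txt.
    """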
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)
    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes,
                                           dropout)
    model.load_weights(weights_filename)
    feature_extractor = FeatExtractorMFCC(window_size,
                                          hop_size,
                                          w,
                                          sr,
                                          h,
                                          step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    frame_predictions = []

    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:  # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')

    X_speech = X[speech_indices]

    X = X_speech.reshape((X_speech.shape[0] * w, h))
    X = scaler.transform(X)
    X = X.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = KMeans(n_clusters=clusters)

    reduced_embeddings = original_embeddings
    predictions = clustering_algorithm.fit_predict(reduced_embeddings)

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".audio.txt"))
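# Script fragment: concatenate music/speech segments (GTZAN, judging by the
# output names) in a shuffled, seeded order into gtzan_combined.wav and write
# the matching ground-truth labels to gtzan_combined.txt. Assumes speech_wavs,
# music_wavs and all_files_dict were populated earlier, as in the muspeak
# example above.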
for f in music_wavs:
    all_files_dict[f] = "m"

random.seed(1111)
all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []

labels = []
for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).decode().strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("gtzan_combined.wav")
subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir,
                                     faces, predictor_path,
                                     face_rec_model_path, tmp_dir):
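    """Same face-based segmentation as above, but caches the computed face
    embeddings and their timestamps as .npy files in tmp_dir so repeated runs
    can skip the dlib detection step; output goes to
    <youtube_video_id>.image.txt."""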

    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    # images_raw = images_raw[0:100]
    images = [
        images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)
    ]
    print(images)
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []
    landmarks_parts = []
    landmarks_rect = []

    embeddings_pickle = os.path.join(tmp_dir, "embeddings.npy")
    embeddings_timestamps_pickle = os.path.join(tmp_dir,
                                                "embeddings_timestamps.npy")

    if not os.path.isfile(embeddings_pickle) or not os.path.isfile(
            embeddings_timestamps_pickle):

        for frame_no, f in enumerate(images):
            print("Processing file: {}".format(f))
            img = io.imread(f)

            dets = detector(img, 1)
            print("Number of faces detected: {}".format(len(dets)))

            for k, d in enumerate(dets):
                shape = sp(img, d)
                face_descriptor = facerec.compute_face_descriptor(img, shape)
                embeddings.append(face_descriptor)
                embeddings_timestamps.append(timestamps[frame_no])
                landmarks_parts.append(shape.parts())
                landmarks_rect.append(shape.rect)

        embeddings = numpy.array(embeddings)
        embeddings_timestamps = numpy.array(embeddings_timestamps)
        numpy.save(embeddings_pickle, embeddings)
        numpy.save(embeddings_timestamps_pickle, embeddings_timestamps)
    else:
        embeddings = numpy.load(embeddings_pickle)
        embeddings_timestamps = numpy.load(embeddings_timestamps_pickle)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([],
                                   os.path.join(
                                       lbls_dir,
                                       youtube_video_id + ".image.txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)

    predictions = numpy.array(kmeans.labels_.tolist())
    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []

    for key, group in df.groupby('timestamps'):
        timestamps.append(key)
        classes.append(",".join(
            [str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
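    """Siamese-LSTM variant that, unlike the K-means version above, projects
    the embeddings to 2-D with t-SNE and clusters them with a Gaussian
    mixture; labels are written both as JSON and as an Audacity label
    track."""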
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)
    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes,
                                           dropout)
    model.load_weights(weights_filename)
    feature_extractor = FeatExtractorMFCC(window_size,
                                          hop_size,
                                          w,
                                          sr,
                                          h,
                                          step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    frame_predictions = []

    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:  # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')

    X_speech = X[speech_indices]
    timestamps_speech = timestamps[speech_indices]

    # Util.write_audacity_labels(Util.generate_labels_from_classifications(frame_predictions, timestamps),
    #                            "vad_preds_quant.txt")

    X = X_speech.reshape((X_speech.shape[0] * w, h))
    X = scaler.transform(X)
    X = X.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters,
                                           max_iter=1000,
                                           n_init=3)

    # if visualise:
    #
    #     embeddings, y, classes, new_timestamps = Util.get_annotated_data_x_y(timestamps_speech, original_embeddings,
    #                                                                          lbls_fixed)
    #     le = preprocessing.LabelEncoder()
    #     y = le.fit_transform(y)
    #
    #     tsne = TSNE()
    #     two_dimensional = tsne.fit_transform(embeddings)
    #
    #     #         pca = PCA(n_components=2)
    #     #         two_dimensional = pca.fit_transform(embeddings)
    #
    #     #         pca2 = PCA(n_components=20)
    #     #         pca_embeddings = pca2.fit_transform(embeddings)
    #
    #     clustering_algorithm.fit(two_dimensional)
    #     predictions = clustering_algorithm.predict(two_dimensional)
    #
    #     #        kmeans = KMeans(n_clusters=CLUSTERS)
    #     #        kmeans.fit(embeddings)
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=y, marker='.')
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    #
    # else:
    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(original_embeddings)

    #         original_embeddings = scale((original_embeddings))

    #         pca = PCA(n_components=2)
    #         two_dimensional = pca.fit_transform(original_embeddings)

    #         pca2 = PCA(n_components=3)
    #         pca_embeddings = pca2.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    #         kmeans = KMeans(n_clusters=CLUSTERS)
    #         kmeans.fit(two_dimensional)

    # plt.figure(figsize=(10,10))
    # plt.imshow(calculate_similarity_matrix(two_dimensional, metric='euclidean'), cmap='gray')

    # plt.figure(figsize=(10,6))
    # plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')

    #         plt.figure(figsize=(10,6))
    #         plt.scatter(two_dimensional[:, 0], pca_embeddings[:, 1], c=kmeans.labels_.tolist(), marker='.')

    #         predictions = kmeans.labels_.tolist()

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))
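# Script fragment: same concatenation as the GTZAN fragment above, but for the
# LabROSA music/speech segments (labrosa_combined.wav / labrosa_combined.txt),
# again assuming music_wavs and all_files_dict were populated earlier.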
for f in music_wavs:
    all_files_dict[f] = "m"

random.seed(1111)
all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []

labels = []
for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).decode().strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "labrosa_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("labrosa_combined.wav")
subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music-speech'])
shutil.rmtree("./music-speech")