Example No. 1
def perform_clustering(embeddings, time_stamps, speakers, audio_rttm_map,
                       out_rttm_dir):
    """
    performs spectral clustering on embeddings with time stamps generated from VAD output
    Args:
    
    embeddings (dict): Embeddings with key as unique_id
    time_stamps (dict): time stamps list for each audio recording
    speakers (dict): number of speaker for each audio recording 
    audio_rttm_map (dict): AUDIO_RTTM_MAP for mapping unique id with audio file path and rttm path
    out_rttm_dir (str): Path to write predicted rttms
    
    Returns:
    all_reference (list[Annotation]): reference annotations for score calculation
    all_hypothesis (list[Annotation]): hypothesis annotations for score calculation

    """
    all_hypothesis = []
    all_reference = []
    no_references = False

    for uniq_key in embeddings.keys():
        NUM_speakers = speakers[uniq_key]
        if NUM_speakers >= 2:
            emb = embeddings[uniq_key]
            emb = np.asarray(emb)

            cluster_method = SpectralClusterer(min_clusters=NUM_speakers,
                                               max_clusters=NUM_speakers)
            cluster_labels = cluster_method.predict(emb)

            lines = time_stamps[uniq_key]
            assert len(cluster_labels) == len(lines)
            for idx, label in enumerate(cluster_labels):
                tag = 'speaker_' + str(label)
                lines[idx] += tag

            a = get_contiguous_stamps(lines)
            labels = merge_stamps(a)
            if out_rttm_dir:
                labels_to_rttmfile(labels, uniq_key, out_rttm_dir)
            hypothesis = labels_to_pyannote_object(labels)
            all_hypothesis.append(hypothesis)

            rttm_file = audio_rttm_map[uniq_key]['rttm_path']
            if os.path.exists(rttm_file) and not no_references:
                ref_labels = rttm_to_labels(rttm_file)
                reference = labels_to_pyannote_object(ref_labels)
                all_reference.append(reference)
            else:
                no_references = True
                all_reference = []

    return all_reference, all_hypothesis
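A minimal usage sketch for the function above. The data shapes are assumptions inferred from the code: embeddings keyed by recording id, and each time-stamp entry a "start end " string that the speaker tag is appended to.

import numpy as np

embeddings = {"rec_01": np.random.rand(4, 192)}              # 4 segment embeddings
time_stamps = {"rec_01": ["0.00 1.50 ", "1.50 3.00 ",        # "start end " strings;
                          "3.00 4.50 ", "4.50 6.00 "]}       # the speaker tag is appended to each
speakers = {"rec_01": 2}
audio_rttm_map = {"rec_01": {"audio_path": "rec_01.wav",
                             "rttm_path": "rec_01.rttm"}}

all_ref, all_hyp = perform_clustering(embeddings, time_stamps, speakers,
                                      audio_rttm_map,
                                      out_rttm_dir=None)  # pass a directory path to also write RTTM files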
Example No. 2
def cluster_seg(order_list, min_cluster, max_cluster, n_features, n_mfcc):
    """
    Clustering small audio files based on their feature vectors (MFCCs) for speaker verification
    Args:
        order_list: list of splited audio files
        min_cluster: minimum number of cluster
        max_cluster: maximum number of cluster
        n_features: No. feature be chose for clustering
        n_mfcc: No. of MFCCs features
    Return:
        labels: Speaker verification for each audio files.
    """
    # Create an empty matrix with fixed-size (n_sample, n_feature)
    n_feat = np.zeros((len(order_list), n_features * n_mfcc))

    for idx, audio in enumerate(order_list):
        data, sr = li.load(audio, sr=16000)
        # Calculating MFCC features
        mfccs = li.feature.mfcc(y=data, sr=sr, n_mfcc=n_mfcc)
        mfccs = np.array(mfccs)
        # Resize MFCCs and flatten
        mfccs = np.resize(mfccs, (n_mfcc, n_features))
        mfccs = mfccs.flatten()

        n_feat[idx, :] = mfccs
    print(n_feat.shape)
    # Apply spectral clustering to group audio files by their MFCC feature vectors
    clusterer = SpectralClusterer(min_clusters=min_cluster,
                                  max_clusters=max_cluster)
    labels = clusterer.predict(n_feat)
    labels = [str(x) for x in labels]
    # Convert from cluster labels to speaker identification
    speaker_tag = {}
    n = 1
    speaker_tag[labels[0]] = f'Speaker {n}: '
    for s in labels:
        if s not in speaker_tag:
            n += 1
            speaker_tag[s] = f'Speaker {n}: '
    labels = [speaker_tag[i] for i in labels]

    return labels
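A minimal usage sketch (file paths and parameter values are hypothetical):

import glob

segments = sorted(glob.glob("splits/*.wav"))   # the split audio files, in playback order
tags = cluster_seg(segments, min_cluster=2, max_cluster=4,
                   n_features=100, n_mfcc=20)
for path, tag in zip(segments, tags):
    print(tag, path)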
Example No. 3
def signalLabelPrediction(cont_embeds):
    # Find the optimal number of clusters c from the elbow of the K-means WCSS curve
    wcss = []
    r = range(1, 15)
    for k in r:
        km = KMeans(n_clusters=k)
        km = km.fit(cont_embeds)
        wcss.append(km.inertia_)
    kn = KneeLocator(list(r), wcss, S=1.0, curve='convex', direction='decreasing')
    c = kn.knee

    # Passing the clusters to get the labels
    clusterer = SpectralClusterer(
        min_clusters=c,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)

    labels = clusterer.predict(cont_embeds)
    return labels
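A minimal usage sketch with synthetic d-vectors (the blobs below are made up; in practice cont_embeds comes from a voice encoder as in the next examples):

import numpy as np

# two synthetic speaker "blobs" standing in for d-vector embeddings
rng = np.random.default_rng(0)
spk_a = rng.normal(0.0, 0.05, (50, 256))
spk_a[:, :128] += 1.0          # speaker A points mostly along the first half of the dims
spk_b = rng.normal(0.0, 0.05, (50, 256))
spk_b[:, 128:] += 1.0          # speaker B points mostly along the second half
cont_embeds = np.vstack([spk_a, spk_b])

labels = signalLabelPrediction(cont_embeds)
print(np.unique(labels))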
Example No. 4
def clustering(wav):
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(
        wav, return_partials=True, rate=16)  #create d-vectors
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)

    labels = clusterer.predict(cont_embeds)
    times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    labelling = []
    start_time = 0

    for i, time in enumerate(times):
        if i > 0 and labels[i] != labels[i - 1]:
            temp = ['speaker ' + str(labels[i - 1]), start_time, time]
            labelling.append(tuple(temp))
            start_time = time
        if i == len(times) - 1:
            temp = ['speaker ' + str(labels[i]), start_time, time]
            labelling.append(tuple(temp))
    return labelling  # list of (speaker, start, end) tuples
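A minimal usage sketch, assuming the resemblyzer imports the snippet relies on (VoiceEncoder, sampling_rate) are available at module level; the file path is hypothetical:

from resemblyzer import preprocess_wav

wav = preprocess_wav("meeting.wav")           # hypothetical recording
for speaker, start, end in clustering(wav):
    print(f"{speaker}: {start:.1f}s -> {end:.1f}s")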
Example No. 5

clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=3,
    p_percentile=0.99,
    gaussian_blur_sigma=3,
    thresholding_soft_multiplier=0.5,
    stop_eigenvalue=10e-5)
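# Note: the stock SpectralClusterer.predict() returns only the label array; the call
# below assumes a modified clusterer that also returns the spectral embeddings and
# cluster centers used for the plots further down.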

labels, embeddings, centers = clusterer.predict(X_cluster)

#### SAVING RESULTS

np.save(dir_audio + 'labels.npy', labels)
with open(dir_audio + 'order_pred.txt', "w") as f:
    for s in order_pred:
        f.write(str(s) +"\n")
        
#### PLOTTING CLUSTER

plt.figure(1)
plt.title('Spectral Cluster prediction')
plt.scatter(embeddings[:, 0], embeddings[:, 1],c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5)
plt.ylabel('Embeddings 1') 
Example No. 6
autotune = AutoTune(p_percentile_min=0.60,
                    p_percentile_max=0.95,
                    init_search_step=0.01,
                    search_level=3)

icassp2018_clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=18,
    autotune=None,
    laplacian_type=None,
    refinement_options=icassp2018_refinement_options,
    custom_dist="cosine")

for idx, sample_id in enumerate(sample_ids):
    labels = icassp2018_clusterer.predict(sequences[idx])
    print(f'Predicted labels for {sample_id} ({idx + 1}/{len(sample_ids)})')

    annotation = Annotation()
    annotation.uri = sample_id
    for jdx, speaker_id in enumerate(labels):
        segment_interval = intervals[idx][jdx]
        annotation[Segment(segment_interval[0],
                           segment_interval[1])] = speaker_id

    rttm_file = '{}/{}.rttm'.format(rttm_dir, sample_id)
    with open(rttm_file, 'w') as file:
        annotation.support().write_rttm(file)

    # rttm_file_collar = '{}/rttm_colar/{}.rttm'.format(rttm_dir, sample_id)
    # with open(rttm_file_collar, 'w') as file:
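The snippet passes refinement_options=icassp2018_refinement_options without showing how that object is built, and the AutoTune instance created above is not actually used since autotune=None. In recent releases of the spectralcluster package the refinement options are constructed separately, roughly as sketched below (keyword names may differ slightly between versions):

from spectralcluster import (ICASSP2018_REFINEMENT_SEQUENCE, RefinementOptions,
                             ThresholdType)

icassp2018_refinement_options = RefinementOptions(
    gaussian_blur_sigma=1,
    p_percentile=0.95,
    thresholding_soft_multiplier=0.01,
    thresholding_type=ThresholdType.RowMax,
    refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE)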
Example No. 7
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

print(cont_embeds.shape)

####################################################################################################
# D vector clustering


clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=100,
    p_percentile=0.90,
    gaussian_blur_sigma=1)

labels = clusterer.predict(cont_embeds)

####################################################################################################
# Create Continuous Segments


def create_labelling(labels, wav_splits):
    from resemblyzer import sampling_rate
    times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    labelling = []
    start_time = 0

    for i, time in enumerate(times):
        if i > 0 and labels[i] != labels[i - 1]:
            temp = [str(labels[i - 1]), start_time, time]
            labelling.append(tuple(temp))
            start_time = time
        if i == len(times) - 1:
            temp = [str(labels[i]), start_time, time]
            labelling.append(tuple(temp))

    return labelling
Example No. 8
        for p_percentile in p_percentiles:
            for gaussian_blur_sigma in gaussian_blur_sigmas:
                for thresholding_soft_multiplier in thresholding_soft_multipliers:
                    for stop_eigenvalue in stop_eigenvalues:
                        #                        print('  Values: p_percentile=' + str(p_percentile) + ' gaussian_blur_sigma=' + str(gaussian_blur_sigma) + ' thresholding_soft_multiplier=' + str(thresholding_soft_multiplier) + ' stop_eigenvalue=' + str(stop_eigenvalue))
                        clusterer = SpectralClusterer(
                            min_clusters=2,
                            max_clusters=10,
                            p_percentile=p_percentile,
                            gaussian_blur_sigma=gaussian_blur_sigma,
                            thresholding_soft_multiplier=
                            thresholding_soft_multiplier,
                            stop_eigenvalue=stop_eigenvalue)

                        labels = clusterer.predict(X_cluster)

                        speaker_1 = []
                        speaker_2 = []
                        speaker = []

                        for i in range(len(labels)):
                            pos = int(order_list[i][8:12]) - 1
                            num = int(order_list[i][13:17]) - 1
                            ini, final = times[pos].split('\t')
                            if round(
                                    float(ini) + ((num) *
                                                  (window_t *
                                                   (1 - overlap)) + window_t),
                                    4) < float(final):
                                interval = [
Example No. 9
def get_full_translation(audio_file_name):
    def mp3_to_wav(file):
        # convert an mp3 input to wav; wav inputs are left untouched
        if file.split('.')[1] == 'mp3':
            sound = AudioSegment.from_mp3(file)
            sound.export(f"{file.split('.')[0]}.wav", format="wav")

    mp3_to_wav(audio_file_name)
    wav_fpath = f"{audio_file_name.split('.')[0]}.wav"

    # preprocess the wav: resample, normalize volume and trim long silences
    wav = preprocess_wav(wav_fpath)
    # Creates a voice encoder object so we can process audio with the cpu
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    #create a cluster object
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)

    #label the speaker that is speaking at certain time
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0

        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                temp = [str(labels[i - 1]), start_time, time]
                labelling.append(tuple(temp))
                start_time = time
            if i == len(times) - 1:
                temp = [str(labels[i]), start_time, time]
                labelling.append(tuple(temp))

        return labelling

    labelling = create_labelling(labels, wav_splits)
    print(labelling)

    #read data from the wave file

    def split_audio(file_path):
        n = 0
        # load the full recording once; each labelled segment is sliced from it below
        newAud = AudioSegment.from_wav(file_path)
        for i in labelling:
            n += 1
            # convert the (start, end) times from seconds to milliseconds
            start = int(i[1] * 1000)
            end = int(i[2] * 1000)
            # slice out this segment and export it as its own wav file
            newAudio = newAud[start:end]
            newAudio.export(f'audio_files/split/SplitAudio_{n}.wav',
                            format="wav")

        return n

    n = split_audio(wav_fpath)

    def get_translation():
        entire_text = []
        for num in range(n):
            r = sr.Recognizer()

            harvard = sr.AudioFile(f'audio_files/split/SplitAudio_{num+1}.wav')
            with harvard as source:
                r.adjust_for_ambient_noise(source)
                r.enable_separate_recognition_per_channel = True

                audio = r.record(source)
                text = r.recognize_google(audio, language='en-US')

                # prefix with the speaker label of the matching diarization segment
                text = f'Speaker {labelling[num][0]}: ' + str(text)
                entire_text.append(text)
        return entire_text

    translation = get_translation()

    return render_template('translation.html', translation=translation)
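The render_template call implies this function runs inside a Flask app; one possible way to expose it as a view (the route and app names are hypothetical):

from flask import Flask

app = Flask(__name__)

@app.route("/translate/<path:audio_file_name>")
def translate(audio_file_name):
    # get_full_translation itself ends with render_template('translation.html', ...)
    return get_full_translation(audio_file_name)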
Example No. 10
def spectral_cluster(
    vad_results,
    speaker_vector,
    min_clusters: int = None,
    max_clusters: int = None,
    p_percentile: float = 0.95,
    gaussian_blur_sigma=1.0,
    norm_function: Callable = l2_normalize,
    log_distance_metric: str = None,
    return_embedding=False,
    **kwargs,
):
    """
    Speaker diarization using SpectralCluster, https://github.com/wq2012/SpectralCluster

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_vector: callable
        speaker vector object.
    min_clusters: int, optional (default=None)
        minimal number of clusters allowed (only effective if not None).
    max_clusters: int, optional (default=None)
        maximal number of clusters allowed (only effective if not None).
        can be used together with min_clusters to fix the number of clusters.
    p_percentile: float, optional (default=0.95)
        the p-percentile for the row-wise thresholding.
    gaussian_blur_sigma: float, optional (default=1.0)
        sigma value of the Gaussian blur operation.
    norm_function: Callable, optional (default=malaya_speech.utils.dist.l2_normalize)
        normalize function for speaker vectors.
    log_distance_metric: str, optional (default=None)
        post distance norm in log scale metrics.

    Returns
    -------
    result : List[Tuple[Frame, label]]
    """
    try:
        from spectralcluster import SpectralClusterer

    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            'spectralcluster not installed. Please install it by `pip install spectralcluster` and try again.'
        )

    clusterer = SpectralClusterer(
        min_clusters=min_clusters,
        max_clusters=max_clusters,
        p_percentile=p_percentile,
        gaussian_blur_sigma=gaussian_blur_sigma,
        **kwargs,
    )

    speakers, activities, mapping = [], [], {}
    for no, result in enumerate(vad_results):
        if result[1]:
            speakers.append('got')
            mapping[len(activities)] = no
            vector = speaker_vector([result[0]])[0]
            activities.append(vector)
        else:
            speakers.append('not a speaker')

    activities = np.array(activities)
    if norm_function:
        activities = norm_function(activities)

    if log_distance_metric:
        activities = compute_log_dist_matrix(activities, log_distance_metric)

    cluster_labels = clusterer.predict(activities)

    for k, v in mapping.items():
        speakers[v] = f'speaker {cluster_labels[k]}'

    results = []
    for no, result in enumerate(vad_results):
        results.append((result[0], speakers[no]))

    if return_embedding:
        return results, activities
    else:
        return results
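A rough usage sketch with synthetic stand-ins for the VAD frames and the speaker-vector model (in practice both come from malaya_speech pipelines; every name and value below is made up for illustration):

import numpy as np

def fake_speaker_vector(frames):
    # stand-in for a real speaker-vector model: each frame becomes a 256-dim
    # embedding whose direction depends on the frame's first sample
    embs = []
    for f in frames:
        e = np.random.rand(256) * 0.01
        e[int(f[0])] += 1.0
        embs.append(e)
    return np.stack(embs)

voiced_a = [(np.full(4800, 1.0), True)] * 3   # frames from "speaker A"
voiced_b = [(np.full(4800, 2.0), True)] * 3   # frames from "speaker B"
silence = [(np.zeros(4800), False)] * 2       # non-speech frames
vad_results = voiced_a + silence + voiced_b

result = spectral_cluster(vad_results, fake_speaker_vector,
                          min_clusters=2, max_clusters=2)
for frame, label in result:
    print(label)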
Example No. 11
def Diariazation(path, filename):
    src = path
    # dst = str(filename)+".wav"
    dst = path
    # if(path.split(".")[1]!='wav'):
    #     # dst=convertmp3towav(path,filename)
    #     return {"msg":"Only Wav files are Supported"}
    # sound = AudioSegment.from_mp3(src)
    # sound.export(dst, format="wav")

    audio_file_path = dst

    wav_fpath = Path(audio_file_path)

    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    clusterer = SpectralClusterer(min_clusters=1,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)

    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0

        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                temp = [str(labels[i - 1]), start_time, time]
                labelling.append(tuple(temp))
                start_time = time
            if i == len(times) - 1:
                temp = [str(labels[i]), start_time, time]
                labelling.append(tuple(temp))

        return labelling

    labelling = create_labelling(labels, wav_splits)

    dd = defaultdict(list)

    for tpl in labelling:
        # print(type(tpl))
        # print(tpl[0])
        dd[tpl[0]].append([tpl[1], tpl[2]])

    transcript_list = defaultdict(list)

    # t1 = t1 * 1000 #Works in milliseconds
    # t2 = t2 * 1000

    split_audio_path = defaultdict(list)
    for speaker in dd.keys():
        ind = 0
        for duration in dd[speaker]:
            l = len(split_audio_path[speaker])
            t1 = duration[0] * 1000
            t2 = duration[1] * 1000

            newAudio = AudioSegment.from_wav(audio_file_path)
            newAudio = newAudio[t1:t2]

            save_path = './Audio/AudioD/user_' + str(speaker) + str(l) + '.wav'
            newAudio.export(
                save_path,
                format="wav")  #Exports to a wav file in the current path.

            split_audio_path[speaker].append(save_path)

            ind += 1

    import speech_recognition as sr
    r = sr.Recognizer()

    for speaker in split_audio_path.keys():
        for path in split_audio_path[speaker]:
            with sr.AudioFile(path) as source:  # use this split wav file as the audio source
                audio = r.record(source)  # extract audio data from the file

                try:
                    # recognize speech using Google Speech Recognition
                    transcript = r.recognize_google(audio)
                    print("Transcription: " + transcript)

                    transcript_list[speaker].append(transcript)
                except sr.UnknownValueError:  # speech is unintelligible
                    print("Could not understand audio")
    return {"intervals": dd, "list": transcript_list}
Example No. 12
args = parser.parse_args()

audio_path = Path(args.audio)
wav = preprocess_wav(audio_path)

encoder = VoiceEncoder() if args.rate <= 4 else VoiceEncoder("cpu")

# encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(
    wav,
    return_partials=True,
    rate=args.rate,
)

clusterer = SpectralClusterer(
    min_clusters=args.num,
    p_percentile=0.91,
)


labels = clusterer.predict(cont_embeds)
times = np.array([(s.start + s.stop) / 2 / sampling_rate for s in wav_splits])


if args.interactive:
    interactive_diarization(times, labels, wav, labels.max() + 1)
else:
    time_intervals = get_time_intervals(times, labels)
    log_speaker_diary(time_intervals)
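get_time_intervals and log_speaker_diary are referenced but not shown; a possible implementation of both, following the same segment-merging logic as create_labelling in the earlier examples (the names and output format are assumptions):

def get_time_intervals(times, labels):
    # merge consecutive windows that share a label into (speaker, start, end) spans
    intervals = []
    start_time = 0
    for i, time in enumerate(times):
        if i > 0 and labels[i] != labels[i - 1]:
            intervals.append((f"speaker {labels[i - 1]}", start_time, time))
            start_time = time
        if i == len(times) - 1:
            intervals.append((f"speaker {labels[i]}", start_time, time))
    return intervals


def log_speaker_diary(time_intervals):
    for speaker, start, end in time_intervals:
        print(f"{speaker}: {start:8.2f}s -> {end:8.2f}s")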