Example #1
def featureExtractionDirWrapper(directory, mt_win, mt_step, st_win, st_step):
    if not os.path.isdir(directory):
        raise Exception("Input path not found!")
    aF.mid_feature_extraction_file_dir(directory, mt_win, mt_step, st_win,
                                       st_step, True, True, True)
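
A minimal usage sketch for the wrapper above; the directory name and window sizes are placeholder assumptions:

# hypothetical call: 1.0 s mid-term and 0.05 s short-term windows
featureExtractionDirWrapper("wav_dir", 1.0, 1.0, 0.05, 0.05)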
Example #2
def check_duration(directory):
    original_len = len(os.listdir(directory))
    for audio in os.listdir(directory):
        wav_file_path = os.path.join(directory, audio)

        with sf.SoundFile(wav_file_path) as f:
            duration = len(f) / f.samplerate

        if duration < 1:
            os.remove(wav_file_path)

    remaining_len = len(os.listdir(directory))
    print('In total, {} audios have been removed due to short duration'.format(original_len - remaining_len))

check_duration(pos_path)
check_duration(neg_path)

[mid_term_features_pos, wav_file_list_pos, mid_feature_names] = mF.directory_feature_extraction(pos_path, 0.5, 0.5, 0.05, 0.05, compute_beat=False)
[mid_term_features_neg, wav_file_list_neg, mid_feature_names] = mF.directory_feature_extraction(neg_path, 0.5, 0.5, 0.05, 0.05, compute_beat=False)

filenames_pos = []
for file in wav_file_list_pos:
    filenames_pos.append(file.split('/')[-1].split('\\')[-1].split('.')[0])

filenames_neg = []
for file in wav_file_list_neg:
    filenames_neg.append(file.split('/')[-1].split('\\')[-1].split('.')[0])

df_pos = pd.DataFrame(mid_term_features_pos, columns=mid_feature_names)
df_pos['filename'] = filenames_pos
df_pos['label'] = np.ones(len(df_pos))

df_neg = pd.DataFrame(mid_term_features_neg, columns=mid_feature_names)
df_neg['filename'] = filenames_neg
df_neg['label'] = np.zeros(len(df_neg))
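
A hedged follow-up sketch, assuming the intent is one labeled training table; the output file name is a placeholder:

# merge positive and negative examples into a single training DataFrame
df_all = pd.concat([df_pos, df_neg], ignore_index=True)
df_all.to_csv('features.csv', index=False)  # hypothetical output path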
Example #3
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2, 
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   False for not plotting the results, True for plotting
    """
    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(os.path.join(base_dir,
                                                    "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                    len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0]+len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i+num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                        len(classNames1) + len(classNames2),
                                         mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] + len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0]+len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []
    
    for iSpeakers in s_range:        
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_        
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []; sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls==c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T, 
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                     + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append( ( sil_2[c] - sil_1[c]) / (max(sil_2[c],
                                                      sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))        
        cls[i] = clsAll[imax][j]
        
    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat            
        hmm.means_ = means; hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)                    
    
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs, mt_step)

    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
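
A usage sketch for the function above; the file name and speaker count are placeholder assumptions:

# hypothetical call: diarize a 4-speaker recording and plot the result
cls = speakerDiarization("meeting.wav", 4, plot_res=True)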
Example #4
def extract_features_and_train(paths,
                               mid_window,
                               mid_step,
                               short_window,
                               short_step,
                               classifier_type,
                               model_name,
                               compute_beat=False,
                               train_percentage=0.90):
    """
    This function is used as a wrapper to segment-based audio feature extraction
    and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each directory
                                    contains a single audio class whose samples
                                    are stored in separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "knn" or "randomforest" or
                                    "gradientboosting" or "extratrees"
        model_name:                 name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """

    # STEP A: Feature Extraction:
    features, class_names, _ = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    write_train_data_arff(model_name, features, class_names, feature_names)

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = evaluate_classifier(features, class_names, 100,
                                     classifier_type, classifier_par, 0,
                                     train_percentage)

    print("Selected params: {0:.5f}".format(best_param))

    features_norm, mean, std = normalize_features(features)
    mean = mean.tolist()
    std = std.tolist()

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = train_svm(features_norm, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features_norm, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features_norm, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features_norm, best_param)
    elif classifier_type == "extratrees":
        classifier = train_extra_trees(features_norm, best_param)

    if classifier_type == "knn":
        feature_matrix, labels = features_to_matrix(features_norm)
        feature_matrix = feature_matrix.tolist()
        labels = labels.tolist()
        save_path = model_name
        save_parameters(save_path, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)

    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "MEANS"
        save_parameters(save_path, mean, std, class_names, mid_window,
                        mid_step, short_window, short_step, compute_beat)
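
A usage sketch, assuming one directory of WAV files per class; paths and model name are placeholders:

# hypothetical training run: binary speech/music SVM
extract_features_and_train(["data/speech", "data/music"], 1.0, 1.0, 0.05, 0.05,
                           "svm", "svm_speech_music")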
Example #5
def record_audio(block_size,
                 fs=8000,
                 show_spec=False,
                 show_chroma=False,
                 log_sounds=False,
                 logs_all=False):

    # initialize the recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT,
                     channels=1,
                     rate=fs,
                     input=True,
                     frames_per_buffer=mid_buf_size)
    mid_buf = []
    count = 0
    global all_data
    global outstr
    all_data = []
    # initialize counters etc
    time_start = time.time()
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")
    out_folder = outstr + "_segments"
    if log_sounds:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
    # load segment model
    [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")

    while True:
        try:
            block = stream.read(mid_buf_size)
            count_b = len(block) // 2
            fmt = "%dh" % count_b
            shorts = struct.unpack(fmt, block)
            cur_win = list(shorts)
            mid_buf = mid_buf + cur_win
            del cur_win

            # time since recording started:
            e_time = (time.time() - time_start)
            # data-driven time
            data_time = (count + 1) * block_size
            x = numpy.int16(mid_buf)
            seg_len = len(x)

            # extract features
            # We are using the signal length as mid term window and step,
            # in order to guarantee a mid-term feature sequence of len 1
            [mt_feats, _,
             _] = mF.mid_feature_extraction(x, fs, seg_len, seg_len,
                                            round(fs * st_win),
                                            round(fs * st_step))
            cur_fv = (mt_feats[:, 0] - MEAN) / STD
            # classify vector:
            [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", cur_fv)
            win_class = class_names[int(res)]
            win_prob = prob[int(res)]

            if logs_all:
                all_data += mid_buf
            mid_buf = numpy.double(mid_buf)

            # Compute spectrogram
            if show_spec:
                (spec, t_axis,
                 freq_axis_s) = sF.spectrogram(mid_buf, fs, 0.050 * fs,
                                               0.050 * fs)
                freq_axis_s = numpy.array(freq_axis_s)  # frequency axis
                # most dominant frequencies (for each short-term window):
                dominant_freqs = freq_axis_s[numpy.argmax(spec, axis=1)]
                # get average most dominant freq
                max_freq = numpy.mean(dominant_freqs)
                max_freq_std = numpy.std(dominant_freqs)

            # Compute chromagram
            if show_chroma:
                (chrom, TimeAxisC,
                 freq_axis_c) = sF.chromagram(mid_buf, fs, 0.050 * fs,
                                              0.050 * fs)
                freq_axis_c = numpy.array(freq_axis_c)
                # most dominant chroma classes:
                dominant_freqs_c = freq_axis_c[numpy.argmax(chrom, axis=1)]
                # get most common among all short-term windows
                max_freqC = most_common(dominant_freqs_c)[0]

            # Plot signal window
            signalPlotCV = plotCV(
                scipy.signal.resample(mid_buf + 16000, plot_w), plot_w, plot_h,
                32000)
            cv2.imshow('Signal', signalPlotCV)
            cv2.moveWindow('Signal', 50, status_h + 50)

            # Show spectrogram
            if show_spec:
                i_spec = numpy.array(spec.T * 255, dtype=numpy.uint8)
                i_spec2 = cv2.resize(i_spec, (plot_w, plot_h),
                                     interpolation=cv2.INTER_CUBIC)
                i_spec2 = cv2.applyColorMap(i_spec2, cv2.COLORMAP_JET)
                cv2.putText(i_spec2, "max_freq: %.0f Hz" % max_freq, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Spectrogram', i_spec2)
                cv2.moveWindow('Spectrogram', 50, plot_h + status_h + 60)
            # Show chromagram
            if show_chroma:
                i_chroma = numpy.array((chrom.T / chrom.max()) * 255,
                                       dtype=numpy.uint8)
                i_chroma2 = cv2.resize(i_chroma, (plot_w, plot_h),
                                       interpolation=cv2.INTER_CUBIC)
                i_chroma2 = cv2.applyColorMap(i_chroma2, cv2.COLORMAP_JET)
                cv2.putText(i_chroma2, "max_freqC: %s" % max_freqC, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Chroma', i_chroma2)
                cv2.moveWindow('Chroma', 50, 2 * plot_h + status_h + 60)

            # Activity Detection:
            print("{0:.2f}\t{1:s}\t{2:.2f}".format(e_time, win_class,
                                                   win_prob))

            if log_sounds:
                # TODO: log audio files
                out_file = os.path.join(
                    out_folder,
                    "{0:.2f}_".format(e_time).zfill(8) + win_class + ".wav")
                #shutil.copyfile("temp.wav", out_file)
                wavfile.write(out_file, fs, x)

            textIm = numpy.zeros((status_h, plot_w, 3))
            statusStrTime = "time: %.1f sec" % e_time + \
                            " - data time: %.1f sec" % data_time + \
                            " - loss : %.1f sec" % (e_time - data_time)
            cv2.putText(textIm, statusStrTime, (0, 11), cv2.FONT_HERSHEY_PLAIN,
                        1, (200, 200, 200))
            cv2.putText(textIm, win_class, (0, 33), cv2.FONT_HERSHEY_PLAIN, 1,
                        (0, 0, 255))
            cv2.imshow("Status", textIm)
            cv2.moveWindow("Status", 50, 0)
            mid_buf = []
            ch = cv2.waitKey(10)
            count += 1
        except IOError:
            print("Error recording")
Example #6

    if parse.audio is None:
        raise ValueError('Input directory is empty')
    if not os.path.isdir(parse.audio):
        raise ValueError('Input path is not a directory')
    if parse.groundtruth is None:
        raise ValueError('Ground truth directory is empty')
    if not os.path.isdir(parse.groundtruth):
        raise ValueError('Ground truth path is not a directory')

    files, labels = read_data(parse.audio, parse.groundtruth)

    one_hot = MultiLabelBinarizer()
    labels = one_hot.fit_transform(labels)
    class_names = [str(c) for c in one_hot.classes_]

    mid_window, mid_step, short_window, short_step = 1, 1, 0.1, 0.1
    f, fn, feature_names = mF.directory_feature_extraction(
        parse.audio, mid_window, mid_step, short_window, short_step)

    x_train, y_train, x_test, y_test = split_data(f, labels, fn)
    x_sub, y_sub = get_minority_instace(pd.DataFrame(x_train),
                                        pd.DataFrame(y_train))
    x_res, y_res = MLSMOTE(x_sub, y_sub, parse.resampled)
    print("Resampled")
    x = pd.concat([pd.DataFrame(x_train), x_res], ignore_index=True)
    y = pd.concat([pd.DataFrame(y_train), y_res], ignore_index=True)
    print('Synthetic data have been added to the train set')
    class_names = [str(c) for c in y.columns]
    print("LinearSVc Classifier")
    classifier = OneVsRestClassifier(LinearSVC(max_iter=50000,
                                               class_weight='balanced'),
                                     n_jobs=-1)
    classifier.fit(x, y)
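
A hedged evaluation sketch for the fitted one-vs-rest model, reusing the held-out split from above:

# macro-averaged F1 on the multi-label test set
from sklearn.metrics import f1_score
y_pred = classifier.predict(x_test)
print(f1_score(y_test, y_pred, average='macro'))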
Example #7

def audio_features_extraction(dir_name="../data",
                              mt_win=1.0,
                              mt_step=1.0,
                              st_win=0.050,
                              st_step=0.050,
                              features_audio_file='Audio2Features.pkl'):

    audio_dir = dir_name + '/' + 'audio'

    # first, extract audio from video
    v2a.video2audio(dir_name)

    features = []
    file_names = []

    mid_term_features = np.array([])
    process_times = []

    # type is WAVE file, convert using the function video_to_audio.py
    suffix = ".wav"
    index_df = pd.read_csv(dir_name + '/' + 'index.csv', sep=';')

    wav_file_list, mid_feature_names = [], []

    # iterate each audio file
    print('Extracting features from audio files...')

    bar = progressbar.ProgressBar(maxval=len(index_df), \
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    bar_index = 0
    for ind in index_df.index:
        name = index_df['FILE'][ind]
        seg = str(index_df['SEG'][ind])

        file_path = audio_dir + '/' + name + '/' + seg + suffix
        # print("Analyzing file {0:d} of {1:d}: {2:s}".format(ind+1,len(index_df),file_path))

        if os.stat(file_path).st_size == 0:
            logging.warning("WARNING: EMPTY FILE -- SKIPPING")
            continue
        [sampling_rate, signal] = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            logging.warning("WARNING: NO SAMPLING RATE -- SKIPPING")
            continue

        t1 = time.perf_counter()
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            logging.warning("WARNING: AUDIO FILE TOO SMALL -- SKIPPING")
            continue
        wav_file_list.append(file_path)

        mid_features, _, mid_feature_names = \
            aF.mid_feature_extraction(signal, sampling_rate,
                                    round(mt_win * sampling_rate),
                                    round(mt_step * sampling_rate),
                                    round(st_win * sampling_rate),
                                    round(st_step * sampling_rate))
        mid_features = np.transpose(mid_features)
        mid_features = mid_features.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack(
                    (mid_term_features, mid_features))

        t2 = time.perf_counter()
        duration = float(len(signal)) / sampling_rate
        process_times.append((t2 - t1) / duration)

        # update progress bar index
        bar_index += 1
        bar.update(bar_index)

    bar.finish()

    if len(process_times) > 0:
        print("Audio feature extraction completed. Complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / np.mean(np.array(process_times)))))

    print('Shape: ' + str(mid_term_features.shape))

    ftr_df = pd.DataFrame(data=mid_term_features)
    df = index_df.copy()
    df = pd.concat([df, ftr_df], axis=1)
    df.to_pickle(dir_name + '/' + features_audio_file)

    return mid_term_features, wav_file_list, mid_feature_names
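
A usage sketch; the data directory and pickle name below are the function's own defaults:

# hypothetical call: extract audio features for every segment listed in index.csv
feats, files, names = audio_features_extraction("../data")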
Example #8
def extract_features_and_train(paths,
                               mid_window,
                               mid_step,
                               short_window,
                               short_step,
                               classifier_type,
                               model_name,
                               compute_beat=False,
                               train_percentage=0.90,
                               dict_of_ids=None,
                               use_smote=False):
    """
    This function is used as a wrapper to segment-based audio feature extraction
    and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each directory
                                    contains a single audio class whose samples
                                    are stored in separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "knn" or "randomforest" or
                                    "gradientboosting" or "extratrees"
        model_name:                 name of the model to be saved
        dict_of_ids:                a dictionary which has as keys the full
                                    path of audio files and as values the
                                    respective group ids
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """

    # STEP A: Feature Extraction:
    features, class_names, file_names = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)
    file_names = [item for sublist in file_names for item in sublist]
    if dict_of_ids:
        list_of_ids = [dict_of_ids[file] for file in file_names]
    else:
        list_of_ids = None
    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = evaluate_classifier(features,
                                     class_names,
                                     classifier_type,
                                     classifier_par,
                                     1,
                                     list_of_ids,
                                     n_exp=-1,
                                     train_percentage=train_percentage,
                                     smote=use_smote)

    print("Selected params: {0:.5f}".format(best_param))

    # STEP C: Train and Save the classifier to file
    # Get features in the X, y format:
    features, labels = features_to_matrix(features)
    # Apply smote if necessary:
    if use_smote:
        sm = SMOTE(random_state=2)
        features, labels = sm.fit_resample(features, labels)

    # Use mean/std standard feature scaling:
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    mean = scaler.mean_.tolist()
    std = scaler.scale_.tolist()

    # Then train the final classifier
    if classifier_type == "svm":
        classifier = train_svm(features, labels, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features, labels, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features, labels, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features, labels, best_param)
    elif classifier_type == "extratrees":
        classifier = train_extra_trees(features, labels, best_param)

    # And save the model to a file, along with
    # - the scaling (mean/std) vectors
    # - the feature extraction parameters
    if classifier_type == "knn":
        feature_matrix = features.tolist()
        labels = labels.tolist()
        save_path = model_name
        save_parameters(save_path, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)

    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "MEANS"
        save_parameters(save_path, mean, std, class_names, mid_window,
                        mid_step, short_window, short_step, compute_beat)
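
A usage sketch that differs from Example #4 by enabling SMOTE oversampling; paths and model name are placeholders:

# hypothetical training run with SMOTE-balanced classes
extract_features_and_train(["data/cough", "data/no_cough"], 1.0, 1.0, 0.05, 0.05,
                           "svm_rbf", "cough_model", use_smote=True)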
Example #9
def create_feature_from_audio(filename):
    import ctypes
    import numpy
    import numpy as np
    import pyogg
    import matplotlib.pyplot as plt

    # https://github.com/Zuzu-Typ/PyOgg/issues/19
    # file = pyogg.OpusFile(filename)  # stereo
    # audio_path_opus = "./"
    file = pyogg.OpusFile(filename)
    # buffer_length is in bytes; each 16-bit sample takes two bytes
    target_datatype = ctypes.c_short * (file.buffer_length // 2)
    buffer_as_array = ctypes.cast(file.buffer,
                                  ctypes.POINTER(target_datatype)).contents
    if file.channels == 1:
        wav = numpy.array(buffer_as_array)
    elif file.channels == 2:
        wav = numpy.array((buffer_as_array[0::2], buffer_as_array[1::2]))
    else:
        raise NotImplementedError()
    # This is the final numpy array
    signal = numpy.transpose(wav)
    sampling_rate = 48000
    print(numpy.shape(wav))

    #plt.figure
    #plt.title("Signal Wave...")
    #plt.plot(signal)
    #plt.show()

    # Calculating features from final_data
    from pyAudioAnalysis import MidTermFeatures as mF
    from pyAudioAnalysis import ShortTermFeatures as sF
    from pyAudioAnalysis import audioBasicIO

    mid_window = round(0.1 * sampling_rate)
    mid_step = round(0.1 * sampling_rate)
    short_window = round(sampling_rate * 0.01)
    short_step = round(sampling_rate * 0.01)

    signal = audioBasicIO.stereo_to_mono(signal)
    print(type(signal))
    # print(np.shape(signal))
    signal = signal.astype(
        'float64'
    )  # this line is because librosa was making an error - need floats

    [mid_features, short_features,
     mid_feature_names] = mF.mid_feature_extraction(signal, sampling_rate,
                                                    mid_window, mid_step,
                                                    short_window, short_step)
    mid_features = np.transpose(mid_features)
    mid_term_features = mid_features.mean(axis=0)
    mid_term_features = np.reshape(mid_term_features, (-1, 1))
    mid_term_features = np.transpose(mid_term_features)
    # print(np.shape(mid_term_features))
    # len(mid_feature_names)

    # Getting the classification result with Cough=0, No_Cough=1
    from joblib import dump, load
    from sklearn import preprocessing
    cough_classifier = load('Cough_NoCough_classifier.joblib')
    features = preprocessing.StandardScaler().fit_transform(mid_term_features)
    prediction = cough_classifier.predict(features)  # coughs=0 , no_cough = 1
    return prediction, mid_term_features
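
A usage sketch, assuming an Opus file and the pre-trained joblib model sit next to the script:

# hypothetical call: 0 means cough, 1 means no cough
prediction, features = create_feature_from_audio("sample.opus")
print(prediction)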
Example #10
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the
                            class ID of the i-th segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
                                      "(beat etc) and cannot be used in "
                                      "segmentation")
        return labels, class_names, accuracy, cm
    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature v
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    for i in range(len(segs)):
        print(segs[i], classes[i])
    segs[-1] = len(signal) / float(sampling_rate)
    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step, plot_results)

    return labels, class_names, accuracy, cm
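
A usage sketch, assuming a pre-trained SVM model file; all names are placeholders:

# hypothetical segmentation of a broadcast recording
labels, class_names, acc, cm = mid_term_file_classification(
    "broadcast.wav", "svm_speech_music", "svm", plot_results=False)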
Example #11
def speaker_diarization(filename, n_speakers, mid_window=1.0, mid_step=0.1,
                        short_window=0.1, lda_dim=0, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)      mid-term window step
        - short_window (opt)  short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      False for not plotting the results, True for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_male_female"))


    mid_feats, st_feats, a = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.05),
                                   round(sampling_rate * 0.05))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))
    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4
    # normalize features:
    scaler = StandardScaler()
    mid_feats_norm = scaler.fit_transform(mid_term_features.T)

    # remove outliers:
    # mid_feats_norm is (windows x features), so pdist is taken over rows
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.1 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[i_non_outliers, :]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf", feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        scaler = StandardScaler()
        mt_feats_to_red = scaler.fit_transform(mt_feats_to_red.T).T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        mid_feats_norm = clf.fit_transform(mt_feats_to_red.T, labels)
        #clf.fit(mt_feats_to_red.T, labels)
        #mid_feats_norm = (clf.transform(mid_feats_norm.T)).T
    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []
    
    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm)
        cls = k_means.labels_ 
        cluster_labels.append(cls)
#        cluster_centers.append(means)
        sil_1 = []; sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[cls == c, :]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp)
                sil_1.append(np.mean(dist)*clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[cls == c2, :]
                        dist = distance.cdist(mt_feats_norm_temp,
                                              mid_features_temp)
                        sil_temp.append(np.mean(dist)*(clust_per_cent
                                                       + clust_per_cent_2)/2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    n_wins = mt_feats_norm_or.shape[0]
    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]
    # Post-process method 1: hmm smoothing
    if lda_dim <= 0:
        for index in range(1):
            # hmm training
            start_prob, transmat, means, cov = \
                train_hmm_compute_statistics(mt_feats_norm_or.T, cls)
            hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
            hmm.startprob_ = start_prob
            hmm.transmat_ = transmat
            hmm.means_ = means
            hmm.covars_ = cov
            cls = hmm.predict(mt_feats_norm_or)
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 5)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    purity_cluster_m, purity_speaker_m = -1, -1
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls, purity_cluster_m, purity_speaker_m
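
A usage sketch; passing a non-positive speaker count triggers the 2..9 cluster search shown above (the file name is a placeholder):

# hypothetical call with an unknown number of speakers
cls, cluster_purity, speaker_purity = speaker_diarization("interview.wav", -1)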
Example #12
def train_hmm_from_directory(folder_path, hmm_model_name, mid_window, mid_step):
    """
    This function trains an HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - folder_path:     the path of the data directory
     - hmm_model_name:  the name of the HMM model to be stored
     - mid_window:      mid-term window size
     - mid_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:    a list of class_names

    After training, hmm and class_names, along with the mid_window
    and mid_step values, are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    class_names_all = []
    f_all = None
    for i, f in enumerate(glob.glob(folder_path + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if os.path.isfile(gt_file):
            seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
            flags, class_names = \
                segments_to_labels(seg_start, seg_end, seg_labs, mid_step)
            for c in class_names:
                # update class names:
                if c not in class_names_all:
                    class_names_all.append(c)
            sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
            feature_vector, _, _ = \
                mtf.mid_feature_extraction(signal, sampling_rate,
                                           mid_window * sampling_rate,
                                           mid_step * sampling_rate,
                                           round(sampling_rate * 0.050),
                                           round(sampling_rate * 0.050))

            flag_len = len(flags)
            feat_cols = feature_vector.shape[1]
            min_sm = min(feat_cols, flag_len)
            feature_vector = feature_vector[:, 0:min_sm]
            flags = flags[0:min_sm]

            flags_new = []
            # append features and labels
            for j, fl in enumerate(flags):
                flags_new.append(class_names_all.index(class_names[flags[j]]))

            flags_all = np.append(flags_all, np.array(flags_new))

            if f_all is None:
                f_all = feature_vector
            else:
                f_all = np.concatenate((f_all, feature_vector), axis=1)

    # compute HMM statistics
    class_priors, transition_matrix, means, cov = \
        train_hmm_compute_statistics(f_all, flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")
    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transition_matrix

    save_hmm(hmm_model_name, hmm, class_names_all, mid_window, mid_step)

    return hmm, class_names_all
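
A usage sketch, assuming a folder with paired WAV and .segments files; the names are placeholders:

# hypothetical HMM training over annotated recordings
hmm, class_names = train_hmm_from_directory("data/annotated", "hmm_sm", 1.0, 1.0)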
Example #13

def featureAndTrain(list_of_dirs,
                    mt_win,
                    mt_step,
                    st_win,
                    st_step,
                    classifier_type,
                    model_name,
                    compute_beat=False,
                    perTrain=0.90):
    """
    This function is used as a wrapper to segment-based audio feature extraction
    and classifier training.
    ARGUMENTS:
        list_of_dirs:        list of paths of directories. Each directory
                             contains a single audio class whose samples
                             are stored in separate WAV files.
        mt_win, mt_step:        mid-term window length and step
        st_win, st_step:        short-term window and step
        classifier_type:        "svm" or "knn" or "randomforest" or
                             "gradientboosting" or "extratrees"
        model_name:          name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """

    # STEP A: Feature Extraction:
    [features, classNames,
     _] = aF.multiple_directory_feature_extraction(list_of_dirs,
                                                   mt_win,
                                                   mt_step,
                                                   st_win,
                                                   st_step,
                                                   compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not np.isnan(temp).any()) and (not np.isinf(temp).any()):
                fTemp.append(temp.tolist())
            else:
                print("NaN or Inf found! Feature vector not used for training")
        features2.append(np.array(fTemp))
    features = features2

    bestParam = evaluateclassifier(features, classNames, 100, classifier_type,
                                   classifier_par, 0, perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    C = len(classNames)
    [features_norm, MEAN, STD] = normalizeFeatures(features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(featuresNew, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(featuresNew, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(featuresNew, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(featuresNew, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(featuresNew, bestParam)

    if classifier_type == "knn":
        [X, Y] = listOfFeatures2Matrix(featuresNew)
        X = X.tolist()
        Y = Y.tolist()
        fo = open(model_name, "wb")
        cPickle.dump(X, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(Y, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(bestParam, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
                    classifier_type == "randomforest" or \
                    classifier_type == "gradientboosting" or \
                    classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        fo = open(model_name + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
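

# Usage sketch (folder names are placeholders): each folder holds the WAV
# samples of one class; an SVM is trained and stored as "svm_music_genre"
# together with a "...MEANS" file holding the normalization parameters.
featureAndTrain(["data/music/classical", "data/music/metal"],
                1.0, 1.0, 0.05, 0.05, "svm", "svm_music_genre")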
Exemplo n.º 14
0
"""! 
@brief Example 12
@details pyAudioAnalysis feature extraction for classes organized in folders
and feature histogram representation (per feature and class).
Binary classification task: male vs female speech segments
@author Theodoros Giannakopoulos {[email protected]}
"""
from pyAudioAnalysis import MidTermFeatures as aF
import os.path
import utilities as ut

if __name__ == '__main__':
    dirs = ["../data/gender/male", "../data/gender/female"]
    class_names = [os.path.basename(d) for d in dirs]
    m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05
    features = []
    for d in dirs:
        # get feature matrix for each directory (class)
        f, files, fn = aF.directory_feature_extraction(d, m_win, m_step, s_win,
                                                       s_step)
        features.append(f)
    ut.plot_feature_histograms(features, fn, class_names)
# extract short-term features using 50 msec non-overlapping windows
# (assumes a signal s with sampling rate fs was loaded earlier, e.g. via
# audioBasicIO.read_audio_file, and duration = len(s) / fs)
from pyAudioAnalysis import ShortTermFeatures as aFs
from pyAudioAnalysis import MidTermFeatures as aFm
import numpy as np
import plotly
import plotly.graph_objs as go

win, step = 0.050, 0.050
[f, fn] = aFs.feature_extraction(s, fs, int(fs * win),
                                 int(fs * step))
print(f'{f.shape[1]} frames, {f.shape[0]} short-term features')
print('Feature names:')
for i, nam in enumerate(fn):
    print(f'{i}:{nam}')
# plot short-term energy
# create time axis in seconds
time = np.arange(0, duration - step, win)
# get the feature whose name is 'energy'
energy = f[fn.index('energy'), :]
mylayout = go.Layout(yaxis=dict(title="frame energy value"),
                     xaxis=dict(title="time (sec)"))
plotly.offline.iplot(go.Figure(data=[go.Scatter(x=time,
                                                y=energy)],
                               layout=mylayout))


# get mid-term (segment) feature statistics
# and respective short-term features:
mt, st, mt_n = aFm.mid_feature_extraction(s, fs, 1 * fs, 1 * fs,
                                         0.05 * fs, 0.05 * fs)
print(f'signal duration {len(s)/fs} seconds')
print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors extracted')
print('mid-term feature names')
for i, mi in enumerate(mt_n):
    print(f'{i}:{mi}')
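
# Convenience sketch (pandas is an extra assumption, not used above): put
# the segment statistics in a DataFrame, one row per mid-term segment.
import pandas as pd
df_mt = pd.DataFrame(mt.T, columns=mt_n)
print(df_mt.head())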
Exemplo n.º 16
0
def audio_based_feature_extraction(input_file,
                                   models_directory,
                                   raudio_features_discard=0,
                                   pyaudio_num_features="all",
                                   mode=0,
                                   pyaudio_params=None):
    """
        Export all features for a wav file (silence based + classifier based)
        :param input_file: the audio file
        :param models_directory: the directory which contains all trained
        classifiers (models' files + MEANS files)
        :param raudio_features_discard: number of leading silence/model
        features to drop (0 keeps them all)
        :param pyaudio_num_features: "all", or an integer cap on the number
        of pyAudioAnalysis features kept
        :param mode: 0 = silence + model-based features only, 1 = both
        feature groups, 2 = pyAudioAnalysis features only
        :param pyaudio_params: dict of mid/short window and step sizes
        (required when mode > 0)
        :return: features , feature_names , metadata
    """
    # A. silence features
    fs, dur = get_wav_properties(input_file)
    fs, x = aio.read_audio_file(input_file)

    print(input_file)
    print(len(x) / fs)
    # get the silence estimates using pyAudioAnalysis semi-supervised approach
    # for different windows and steps
    if dur < 6.2:
        seg_limits_short = [[0, dur]]
        seg_limits_long = [[0, dur]]
    else:
        seg_limits_short = aS.silence_removal(x, fs, 0.5, 0.25, 0.5)
        seg_limits_long = aS.silence_removal(x, fs, 1.0, 0.25, 0.5)

    # short windows
    silence_features_short, number_of_pauses_short, total_speech_short = \
        silence_features(seg_limits_short, dur)
    # long windows
    silence_features_long, number_of_pauses_long, total_speech_long = \
        silence_features(seg_limits_long, dur)

    features = []
    feature_names = []

    if mode < 2:

        # B. segment model-based features
        # Load classifier:
        dictionaries = []
        for filename in os.listdir(models_directory):
            model_path = os.path.join(models_directory, filename)
            dictionary = predict_audio_labels(input_file, model_path)[0]
            dictionaries.append(dictionary)

        # list of features and feature names
        feature_names = [
            "Average silence duration short (sec)",
            "Average silence duration long (sec)",
            "Silence segments per minute short (segments/min)",
            "Silence segments per minute long (segments/min)", "Std short",
            "Std long", "Speech ratio short (sec)", "Speech ratio long (sec)",
            "Word rate in speech short (words/sec)",
            "Word rate in speech long (words/sec)"
        ]

        for i in range(len(silence_features_short)):
            features.append(silence_features_short[i])
            features.append(silence_features_long[i])
        for dictionary in dictionaries:
            for label in dictionary:
                feature_string = label + "(%)"
                feature_value = dictionary[label]
                feature_names.append(feature_string)
                features.append(feature_value)
        if raudio_features_discard != 0:
            features = features[raudio_features_discard:]
            feature_names = feature_names[raudio_features_discard:]

    # C. pyaudio features
    if mode > 0:
        (segment_features_stats, segment_features,
         pyaudio_feature_names) = aF.mid_feature_extraction(
             x, fs, round(pyaudio_params['mid_window'] * fs),
             round(pyaudio_params['mid_step'] * fs),
             round(fs * pyaudio_params['short_window']),
             round(fs * pyaudio_params['short_step']))
        pyaudio_list = list(segment_features_stats.mean(axis=1))
        if pyaudio_num_features != "all":
            #pyaudio_num_features = int(pyaudio_num_features)
            pyaudio_list = pyaudio_list[:pyaudio_num_features - 1]
            pyaudio_feature_names = pyaudio_feature_names[:pyaudio_num_features
                                                          - 1]
        features = features + pyaudio_list
        feature_names = feature_names + pyaudio_feature_names

    metadata = {
        "Number of pauses short": number_of_pauses_short,
        "Number of pauses long": number_of_pauses_long,
        "Total speech duration short (sec)": total_speech_short,
        "Total speech duration long (sec)": total_speech_long
    }
    return features, feature_names, metadata
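
# Usage sketch (paths are placeholders): with mode=2 only the
# pyAudioAnalysis statistics are computed, so the parameter dict is required.
params = {"mid_window": 3.0, "mid_step": 3.0,
          "short_window": 0.05, "short_step": 0.05}
feats, names, meta = audio_based_feature_extraction(
    "sample.wav", "models/", mode=2, pyaudio_params=params)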
Exemplo n.º 17
0
import subprocess

import numpy as np
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioAnalysis

#extract some audio

VIDEOFILE = "../data/raw/8/replay.mp4"
AUDIOFILE = "./extracted.wav"
FEATUREFILE = "./extracted.ft"

command = f"ffmpeg -i {VIDEOFILE} -vn {AUDIOFILE} -y"

subprocess.call(command, shell=True)

[Fs, x] = audioBasicIO.read_audio_file(AUDIOFILE)
x = audioBasicIO.stereo_to_mono(x)

midF, shortF, midFNames = MidTermFeatures.mid_feature_extraction(
    x, Fs, 0.1 * Fs, 0.05 * Fs, 0.05 * Fs, 0.025 * Fs)

np.save(FEATUREFILE, midF)
np.savetxt(FEATUREFILE + ".csv", midF.T, delimiter=",", header=",".join(midFNames))
#%%
audioAnalysis.thumbnailWrapper(AUDIOFILE,50)
#explore the audio

audioAnalysis.fileSpectrogramWrapper(AUDIOFILE)

audioAnalysis.fileChromagramWrapper(AUDIOFILE)

audioAnalysis.beatExtractionWrapper(AUDIOFILE, True)
#%%
var = 48
print(f"{var} : {3/199:.5f}")
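
# reload the saved mid-term feature matrix later (np.save appended ".npy"
# to FEATUREFILE automatically)
midF_loaded = np.load(FEATUREFILE + ".npy")
print(midF_loaded.shape)  # (n_mid_features, n_segments)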
Exemplo n.º 18
0
    def exp5():
        print('pyAudioAnalysis example 5')
        dirs = [
            '{0}music/classical'.format(AfeExp.data_folder),
            '{0}music/metal'.format(AfeExp.data_folder)
        ]
        class_names = ['classical', 'metal']
        m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05
        features = []
        for d in dirs:  # get feature matrix for each directory (class)
            f, files, fn = aMF.directory_feature_extraction(
                d, m_win, m_step, s_win, s_step)
            features.append(f)
        print(features[0].shape, features[1].shape)
        f1 = np.array([
            features[0][:, fn.index('spectral_centroid_mean')],
            features[0][:, fn.index('energy_entropy_mean')]
        ])
        f2 = np.array([
            features[1][:, fn.index('spectral_centroid_mean')],
            features[1][:, fn.index('energy_entropy_mean')]
        ])

        print('f1 type:{0}; shape:{1}; value:{2};'.format(
            type(f1), f1.shape, f1))
        print('f2 type:{0}; shape:{1}; value:{2};'.format(
            type(f2), f2.shape, f2))

        y = np.concatenate((np.zeros(f1.shape[1]), np.ones(f2.shape[1])))
        f = np.concatenate((f1.T, f2.T), axis=0)
        print('y: {0}; {1};'.format(y.shape, y))
        print('X: {0}; {1};'.format(f.shape, f))
        # train the svm classifier
        cl = sks.SVC(kernel='rbf', C=20)
        cl.fit(f, y)

        p1 = go.Scatter(x=f1[0, :],
                        y=f1[1, :],
                        name=class_names[0],
                        marker=dict(size=10, color='rgba(255, 182, 193, .9)'),
                        mode='markers')
        p2 = go.Scatter(x=f2[0, :],
                        y=f2[1, :],
                        name=class_names[1],
                        marker=dict(size=10, color='rgba(100, 100, 220, .9)'),
                        mode='markers')
        mylayout = go.Layout(xaxis=dict(title="spectral_centroid_mean"),
                             yaxis=dict(title="energy_entropy_mean"))

        # apply the trained model on the points of a grid
        x_ = np.arange(f[:, 0].min(), f[:, 0].max(), 0.002)
        y_ = np.arange(f[:, 1].min(), f[:, 1].max(), 0.002)
        xx, yy = np.meshgrid(x_, y_)
        X_t = np.c_[xx.ravel(), yy.ravel()]
        print('X_t: {0};'.format(X_t.shape))
        Z = cl.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape) / 2
        # and visualize the grid on the same plot (decision surfaces)
        cs = go.Heatmap(x=x_,
                        y=y_,
                        z=Z,
                        showscale=False,
                        colorscale=[[0, 'rgba(255, 182, 193, .3)'],
                                    [1, 'rgba(100, 100, 220, .3)']])
        mylayout = go.Layout(xaxis=dict(title="spectral_centroid_mean"),
                             yaxis=dict(title="energy_entropy_mean"))
        #plotly.offline.iplot(go.Figure(data=[p1, p2, cs], layout=mylayout))
        plotly.offline.plot({
            'data': [p1, p2, cs],
            'layout': mylayout
        },
                            auto_open=True)
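
        # a quick sanity check of the 2-D toy model above: mean accuracy of
        # the fitted SVM on its own training points (not a test-set score)
        print('train accuracy: {0:.2f}'.format(cl.score(f, y)))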
Exemplo n.º 19
0
def fileGreenwaySpeakerDiarization(filename, output_folder, speech_key="52fe944f29784ae288482e5eb3092e2a", service_region="eastus2",
                                   n_speakers=2, mt_size=2.0, mt_step=0.2,
                                   st_win=0.05, lda_dim=35):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
                            the filename should have a suffix of the form: ..._min_3
                            this informs the service that audio file corresponds to the 3rd minute of the dialogue
        - output_folder    the folder location for saving the audio snippets generated from diarization
        - speech_key       Azure Speech service subscription key
        - service_region   Azure Speech service region (e.g. "eastus2")
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
    """
    '''
    OUTPUTS:
        - cls:             this is a vector with speaker ids in chronological sequence of speaker dialogue.
        - output:          a list of python dictionaries containing dialogue sequence information.
                            - dialogue_id
                            - sequence_id
                            - start_time
                            - end_time
                            - text
    '''

    filename_only = filename if "/" not in filename else filename.split("/")[-1]
    nameoffile = filename_only.split("_min_")[0]
    timeoffile = filename_only.split("_min_")[1]

    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] +
                         len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # [mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        # for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i +
                                num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] +
                              len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*st_win/LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                  + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c],
                                                    sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(
            seg_start, seg_end, seg_labs, mt_step)

    # if plot_res:
    #     fig = plt.figure()
    #     if n_speakers > 0:
    #         ax1 = fig.add_subplot(111)
    #     else:
    #         ax1 = fig.add_subplot(211)
    #     ax1.set_yticks(np.array(range(len(class_names))))
    #     ax1.axis((0, duration, -1, len(class_names)))
    #     ax1.set_yticklabels(class_names)
    #     ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    # if os.path.isfile(gt_file):
    #     if plot_res:
    #         ax1.plot(np.array(range(len(flags_gt))) *
    #                  mt_step + mt_step / 2.0, flags_gt, 'r')
    #     purity_cluster_m, purity_speaker_m = \
    #         evaluateSpeakerDiarization(cls, flags_gt)
    #     print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
    #                                     100 * purity_speaker_m))
        # if plot_res:
        #     plt.title("Cluster purity: {0:.1f}% - "
        #               "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
        #                                                 100 * purity_speaker_m))
    # if plot_res:
    #     plt.xlabel("time (seconds)")
    #     # print s_range, sil_all
    #     if n_speakers <= 0:
    #         plt.subplot(212)
    #         plt.plot(s_range, sil_all)
    #         plt.xlabel("number of clusters")
    #         plt.ylabel("average clustering's silhouette")
    #     if save_plot:
    #         plt.savefig(
    #             f"{output_folder}{filename_only}".replace(".wav", ".png"))
    #     else:
    #         pass
    #     plt.show()

    # Create Time Vector
    time_vec = np.array(range(len(cls)))*mt_step+mt_step/2.0

    # Find Change Points
    speaker_change_index = np.where(np.roll(cls, 1) != cls)[0]

    # Create List of dialogue convos
    output_list = []
    temp = {}
    for ind, sc in enumerate(speaker_change_index):
        temp['dialogue_id'] = str(datetime.now()).strip()
        temp['sequence_id'] = str(ind)
        temp['speaker'] = list(cls)[sc]
        temp['start_time'] = time_vec[sc]
        temp['end_time'] = time_vec[speaker_change_index[ind+1] -
                                    1] if ind+1 < len(speaker_change_index) else time_vec[-1]
        temp["text"] = ""
        output_list.append(temp)
        temp = {}

    def snip_transcribe(output_list, filename, output_folder=output_folder,
                        speech_key=speech_key, service_region=service_region):
        speech_config = speechsdk.SpeechConfig(
            subscription=speech_key, region=service_region)
        speech_config.enable_dictation()

        def recognized_cb(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # Do something with the recognized text
                output_list[ind]['text'] = output_list[ind]['text'] + \
                    str(evt.result.text)
                print(evt.result.text)

        for ind, diag in enumerate(output_list):
            t1 = diag['start_time']
            t2 = diag['end_time']
            newAudio = AudioSegment.from_wav(filename)
            chunk = newAudio[t1*1000:t2*1000]
            filename_out = output_folder + f"snippet_{diag['sequence_id']}.wav"
            # Exports to a wav file in the current path.
            chunk.export(filename_out, format="wav")
            done = False

            def stop_cb(evt):
                """callback that signals to stop continuous recognition upon receiving an event `evt`"""
                print('CLOSING on {}'.format(evt))
                nonlocal done
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config, audio_config=audio_input)
            output_list[ind]['snippet_path'] = filename_out

            speech_recognizer.recognized.connect(recognized_cb)

            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)

            # Start continuous speech recognition
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)

            speech_recognizer.stop_continuous_recognition()

        return output_list

    output = snip_transcribe(output_list, filename,
                             output_folder=output_folder)
    output_json = {filename_only: output}

    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt", "w") as outfile:
        json.dump(output_json, outfile)

    return cls, output_json
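
# Usage sketch (path and credentials are placeholders): diarize the file
# holding minute 3 of a dialogue and transcribe each speaker turn via Azure:
#
#     cls, output = fileGreenwaySpeakerDiarization(
#         "session1_min_3.wav", "snippets/",
#         speech_key="YOUR_SPEECH_KEY", service_region="eastus2",
#         n_speakers=2)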
Exemplo n.º 20
0
'''
pyAudioAnalysis provides two functions for extracting a set of useful
features from a WAV file.
'''

from pyAudioAnalysis import MidTermFeatures as mF
import numpy as np
import pandas as pd
import os

basepath_train_cough = 'C:/Users/Guillem/Desktop/HACKATHON 2020/Unlabeled audio/TRAIN/Cough/'
basepath_train_nocough = 'C:/Users/Guillem/Desktop/HACKATHON 2020/Unlabeled audio/TRAIN/No_Cough/'

[mid_term_features_cough, wav_file_list_cough,
 mid_feature_names] = mF.directory_feature_extraction(basepath_train_cough,
                                                      0.1,
                                                      0.1,
                                                      0.01,
                                                      0.01,
                                                      compute_beat=False)
[mid_term_features_nocough, wav_file_list_nocough,
 mid_feature_names] = mF.directory_feature_extraction(basepath_train_nocough,
                                                      0.1,
                                                      0.1,
                                                      0.01,
                                                      0.01,
                                                      compute_beat=False)

label_nocough = np.zeros(np.shape(mid_term_features_nocough)[0])
label_cough = np.ones(np.shape(mid_term_features_cough)[0])

features = np.concatenate(
    (mid_term_features_cough, mid_term_features_nocough), axis=0)
Exemplo n.º 21
0
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()  # time.clock() was removed in Python 3
            # short-term extraction lives in ShortTermFeatures in the
            # current pyAudioAnalysis API
            F, f_names = ShortTermFeatures.feature_extraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("short-term feature extraction: {0:.1f} x realtime".format(
                perTime1))
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            aT.fileClassification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("Mid-term feature extraction + classification \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            [flagsInd, classesAll,
             acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM",
                                            "svm", False, '')
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("Fixed-size classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False,
                               '')
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("HMM-based classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            segments = aS.silenceRemoval(x,
                                         Fs,
                                         0.050,
                                         0.050,
                                         smoothWindow=1.0,
                                         Weight=0.3,
                                         plot=False)
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("Silence removal \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.perf_counter()
            [A1, A2, B1, B2,
             Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0,
                                             15.0)  # find thumbnail endpoints
            t2 = time.perf_counter()
            perTime1 = duration1 / (t2 - t1)
            print("Thumbnail \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.perf_counter()
            aS.speakerDiarization("diarizationExample.wav",
                                  4,
                                  LDAdim=0,
                                  PLOT=False)
            t2 = time.perf_counter()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.perf_counter()
            aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.perf_counter()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
Exemplo n.º 22
0
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    """
    This function trains an HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmm_model_name:    the name of the HMM model to be stored
     - mt_win:        mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:        a list of class_names

    After training, hmm, class_names, along with the mt_win
    and mt_step values are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.read_audio_file(wav_file)
        [F, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                              mt_step * fs, round(fs * 0.050),
                                              round(fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):      # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = np.append(flags_all, np.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = np.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = trainHMM_computeStatistics(f_all,
                                                                  flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat        
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")   # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
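
# Loading the pickled model back mirrors the dump order above (a sketch;
# "hmm_model" is a placeholder path and cPickle is the module's pickle alias):
with open("hmm_model", "rb") as fi:
    hmm = cPickle.load(fi)
    classes_all = cPickle.load(fi)
    mt_win = cPickle.load(fi)
    mt_step = cPickle.load(fi)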
Exemplo n.º 23
0
def feature_extraction_train_regression(folder_name,
                                        mid_window,
                                        mid_step,
                                        short_window,
                                        short_step,
                                        model_type,
                                        model_name,
                                        compute_beat=False):
    """
    This function is used as a wrapper to segment-based audio
    feature extraction and classifier training.
    ARGUMENTS:
        folder_name:        path of directory containing the WAV files
                         and Regression CSVs
        mid_window, mid_step:    mid-term window length and step
        short_window, short_step:    short-term window and step
        model_type:        "svm" or "knn" or "randomforest"
        model_name:        name of the model to be saved
    RETURNS:
        None. Resulting regression model along with the respective
        model parameters are saved on files.
    """
    # STEP A: Feature Extraction:
    features, _, filenames = \
        aF.multiple_directory_feature_extraction([folder_name], mid_window,
                                                 mid_step, short_window,
                                                 short_step,
                                                 compute_beat=compute_beat)
    features = features[0]
    filenames = [ntpath.basename(f) for f in filenames[0]]
    f_final = []

    # Read CSVs:
    csv_files = glob.glob(folder_name + os.sep + "*.csv")
    regression_labels = []
    regression_names = []
    f_final = []
    for c in csv_files:
        cur_regression_labels = []
        f_temp = []
        # open the csv file that contains the current target value's annotations
        with open(c, 'rt') as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in csv_reader:
                if len(row) == 2:
                    # ... and if the current filename exists
                    # in the list of filenames
                    if row[0] in filenames:
                        index = filenames.index(row[0])
                        cur_regression_labels.append(float(row[1]))
                        f_temp.append(features[index, :])
                    else:
                        print("Warning: {} not found "
                              "in list of files.".format(row[0]))
                else:
                    print(
                        "Warning: Row with unknown format in regression file")

        f_final.append(np.array(f_temp))
        # cur_regression_labels is the list of values
        # for the current regression problem
        regression_labels.append(np.array(cur_regression_labels))
        # regression task name
        regression_names.append(ntpath.basename(c).replace(".csv", ""))
        if len(features) == 0:
            print("ERROR: No data found in any input folder!")
            return

    # TODO: ARFF WRITE????
    # STEP B: classifier Evaluation and Parameter Selection:
    if model_type == "svm" or model_type == "svm_rbf":
        model_params = np.array(
            [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 5.0, 10.0])
    elif model_type == "randomforest":
        model_params = np.array([5, 10, 25, 50, 100])

    errors = []
    errors_base = []
    best_params = []

    for iRegression, r in enumerate(regression_names):
        # get optimal classifier parameter:
        print("Regression task " + r)
        bestParam, error, berror = evaluate_regression(
            f_final[iRegression], regression_labels[iRegression], 100,
            model_type, model_params)
        errors.append(error)
        errors_base.append(berror)
        best_params.append(bestParam)
        print("Selected params: {0:.5f}".format(bestParam))

        features_norm, mean, std = normalize_features([f_final[iRegression]])

        # STEP C: Save the model to file
        if model_type == "svm":
            classifier, _ = train_svm_regression(
                features_norm[0], regression_labels[iRegression], bestParam)
        if model_type == "svm_rbf":
            classifier, _ = train_svm_regression(
                features_norm[0],
                regression_labels[iRegression],
                bestParam,
                kernel='rbf')
        if model_type == "randomforest":
            classifier, _ = train_random_forest_regression(
                features_norm[0], regression_labels[iRegression], bestParam)

        if model_type == "svm" or model_type == "svm_rbf" \
                or model_type == "randomforest":
            with open(model_name + "_" + r, 'wb') as fid:
                cPickle.dump(classifier, fid)
            save_path = model_name + "_" + r + "MEANS"
            save_parameters(save_path, mean, std, mid_window, mid_step,
                            short_window, short_step, compute_beat)

    return errors, errors_base, best_params
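
# Usage sketch (the folder name is a placeholder): the folder must contain
# WAV files plus one CSV per regression target with "filename,value" rows.
errors, baseline, params = feature_extraction_train_regression(
    "data/speech_arousal/", 1.0, 1.0, 0.05, 0.05, "svm", "arousal_model")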
Exemplo n.º 24
0
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the
                            class ID of the i-th segment
    """

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model file not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
                                      "(beat etc) and cannot be used in "
                                      "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.read_audio_file(input_file) # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] - MEAN) / STD  # normalize current feature v
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))   # update probability matrix
    flags_ind = np.array(flags_ind)
    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i-1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(class_names.index(class_names_gt[
                                                          flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)        
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]),int(flags_ind[i])] += 1        
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt,
                                  class_names, mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
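
# Usage sketch (paths are placeholders): classify the fixed-size segments
# of a recording with a pre-trained SVM and score against ground truth.
flags, classes, acc, cm = mtFileClassification(
    "radio.wav", "svm_sm_model", "svm",
    plot_results=False, gt_file="radio.segments")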
Exemplo n.º 25
0
#TODO
'''
Add number of Features
Add types of Features
CSV headers (if possible)
'''

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures

# mid term features saved in csv for each individual song file
MidTermFeatures.mid_feature_extraction_file_dir(
    "D:/Capstone/Testing/New_directory",
    1.0,
    0.75,
    0.050,
    0.005,
    store_short_features=True,
    store_csv=True,
    plot=False)
# (folder_path, mid_window, mid_step,short_window, short_step,store_short_features=False, store_csv=False,plot=False)
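
# read one of the generated per-song CSVs back (the filename shown is
# illustrative; output names are derived from each WAV's path by the library)
import pandas as pd
df = pd.read_csv("D:/Capstone/Testing/New_directory/song1_mt.csv")
print(df.shape)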
Exemplo n.º 26
0
    # tail of an evaluation helper (torch and sklearn.metrics.f1_score are
    # imported in the full script): take the argmax class score per sample
    values, labels = torch.max(test_result, 1)
    y_pred = labels.data.numpy()
    # note: sklearn's f1_score expects (y_true, y_pred)
    return f1_score(test_labels, y_pred)


from pyAudioAnalysis import MidTermFeatures as mt
from pyAudioAnalysis import audioTrainTest as aT
import numpy as np
import os

if os.path.isfile("features.npy"):
    with open('features.npy', 'rb') as f:
        X = np.load(f)
        y = np.load(f)
else:
    features, class_names, file_names = mt.multiple_directory_feature_extraction(
        ["audio/speech", "audio/noise"], 1, 1, 0.1, 0.1, False)
    X, y = aT.features_to_matrix(features)
    with open('features.npy', 'wb') as f:
        np.save(f, np.array(X))
        np.save(f, np.array(y))

dimensions = X.shape[1]

# Split to train/test
X_train = X[::2, :]
y_train = y[::2]
X_test = X[1::2, :]
y_test = y[1::2]

n_nodes = 256
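
# a minimal continuation sketch (assumes PyTorch): a one-hidden-layer
# network with n_nodes units for the binary speech/noise task above
import torch.nn as nn

model = nn.Sequential(nn.Linear(dimensions, n_nodes),
                      nn.ReLU(),
                      nn.Linear(n_nodes, 2))
print(model)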