def fileRegression(inputFile, model_name, model_type, feats=["gfcc", "mfcc"]):
    # Load regression models:
    if not os.path.isfile(inputFile):
        print("fileRegression: wav file not found!")
        return (-1, -1, -1)
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" \
            or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step, compute_beat] = \
            load_model(regression_models[0], True)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step), feats)
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileRegression: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step,
             compute_beat] = load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(regressionWrapper(model, model_type, curFV))  # regression
    return R, regression_names
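
# Hedged usage sketch for fileRegression above; the model basename and wav
# path are assumptions, not files shipped with this snippet. pyAudioAnalysis
# stores one regression model per target as <basename>_<target> (plus a
# matching *MEANS file), so the two returned lists are parallel.
values, names = fileRegression("sample.wav", "data/svmSpeechEmotion", "svm")
for name, value in zip(names, values):
    print("{0:s}: {1:.3f}".format(name, value))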
def __init__(self):
    self.BEAM_WIDTH = 500
    self.LM_ALPHA = 0.75
    self.LM_BETA = 1.85
    self.model_dir = 'DeepSpeech/data/wernicke/model/'
    self.model_file = os.path.join(self.model_dir, 'output_graph.pb')
    # self.model_dir = 'deepspeech-0.6.0-models/'
    # self.model_file = os.path.join(self.model_dir, 'output_graph.pbmm')
    self.lm_file = os.path.join(self.model_dir, 'lm.binary')
    self.trie_file = os.path.join(self.model_dir, 'trie')
    self.save_dir = 'saved_wavs'
    os.makedirs(self.save_dir, exist_ok=True)

    # load segment model
    log.info('Initializing pyAudioAnalysis classifier model...')
    [self.classifier, self.MEAN, self.STD, self.class_names,
     self.mt_win, self.mt_step, self.st_win, self.st_step,
     _] = aT.load_model("wernicke_server_model")
    self.fs = 16000

    log.info('Initializing deepspeech model...')
    self.model = deepspeech.Model(self.model_file, self.BEAM_WIDTH)
    # Temporarily disabling this. I don't think I have nearly enough samples
    # to start doing LM and trie files, etc
    # self.model.enableDecoderWithLM(self.lm_file, self.trie_file,
    #                                self.LM_ALPHA, self.LM_BETA)
    log.info('Models ready.')
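
# Hedged usage sketch for the constructor above (the enclosing class name is
# not shown here, so this stays commented): DeepSpeech 0.6 transcribes
# 16 kHz, 16-bit mono PCM passed as an int16 numpy array, which matches the
# self.fs = 16000 set in __init__.
#
#   server = WernickeServer()          # hypothetical class name
#   fs, audio = wavfile.read(os.path.join(server.save_dir, 'sample.wav'))
#   print(server.model.stt(audio))     # audio must be int16 @ 16 kHz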
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
    if not os.path.isfile(modelName):
        raise Exception("Input modelName not found!")

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model_knn(modelName)

    PsAll = numpy.zeros((len(classNames), ))

    files = "*.wav"
    if os.path.isdir(inputFolder):
        strFilePattern = os.path.join(inputFolder, files)
    else:
        strFilePattern = inputFolder + files

    wavFilesList = []
    wavFilesList.extend(glob.glob(strFilePattern))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return

    Results = []
    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)
        signalLength = x.shape[0] / float(Fs)
        [Result, P, classNames] = aT.fileClassification(wavFile, modelName,
                                                        modelType)
        PsAll += (numpy.array(P) * signalLength)
        Result = int(Result)
        Results.append(Result)
        if outputMode:
            print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))
    Results = numpy.array(Results)

    # print distribution of classes:
    [Histogram, _] = numpy.histogram(Results,
                                     bins=numpy.arange(len(classNames) + 1))
    if outputMode:
        for i, h in enumerate(Histogram):
            print("{0:20s}\t\t{1:d}".format(classNames[i], h))
    PsAll = PsAll / numpy.sum(PsAll)

    if outputMode:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Classes percentage " + inputFolder.replace('Segments', ''))
        ax.axis((0, len(classNames) + 1, 0, 1))
        ax.set_xticks(numpy.array(range(len(classNames) + 1)))
        ax.set_xticklabels([" "] + classNames)
        ax.bar(numpy.array(range(len(classNames))) + 0.5, PsAll)
        plt.show()
    return classNames, PsAll
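
# Hedged usage sketch for classifyFolderWrapper (both paths are assumptions):
# it aggregates per-file classifications into a duration-weighted class
# distribution over all WAV files in a folder.
class_names, class_distribution = classifyFolderWrapper("segments/", "svm",
                                                        "data/svmSM",
                                                        outputMode=True)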
def fileClassification(inputFile, model_name, model_type, feats=["gfcc", "mfcc"]):
    # Load classifier:
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)
    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(model_name)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mt_win:
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step), feats)
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)

    curFV = (mt_features - MEAN) / STD  # normalization
    # classification
    [Result, P] = classifierWrapperHead(classifier, model_type, curFV)
    return Result, P, classNames
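
# Hedged usage sketch for fileClassification (model and wav paths are
# assumptions): the function returns the winning class index, the posterior
# vector and the class names, so the top label can be printed like this.
result, probabilities, class_names = fileClassification("sample.wav",
                                                        "data/svmSM", "svm")
if not isinstance(class_names, int):  # (-1, -1, -1) signals a missing file
    print("predicted: {0:s} (p={1:.2f})".format(class_names[int(result)],
                                                max(probabilities)))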
def getMusicSegmentsFromFile(inputFile, minDuration=10.0):
    # minDuration (seconds) was an undefined module-level constant in the
    # original snippet; exposed here as an assumed default so the function
    # is self-contained.
    modelType = "svm"
    modelName = "data/svmMovies8classes"

    dirOutput = inputFile[0:-4] + "_musicSegments"
    if os.path.exists(dirOutput) and dirOutput != ".":
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model_knn(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(
        inputFile, modelName, modelType, plotResults=False, gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)

    for i, s in enumerate(segs):
        if (classNames[int(classes[i])] == "Music") and \
                (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep,
                                                       s[0], s[1])
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
def speaker_diarization(filename, n_speakers, mid_window=1.0, mid_step=0.1,
                        short_window=0.1, lda_dim=0, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)      mid-term window step
        - short_window (opt)  short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      0 for not plotting the results, 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_male_female"))

    mid_feats, st_feats, a = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.05),
                                   round(sampling_rate * 0.05))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                      feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf",
                                      feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    # normalize features:
    scaler = StandardScaler()
    mid_feats_norm = scaler.fit_transform(mid_term_features.T)

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.1 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)

        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2

        scaler = StandardScaler()
        mt_feats_to_red = scaler.fit_transform(mt_feats_to_red.T).T

        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        mid_feats_norm = clf.fit_transform(mt_feats_to_red.T, labels)
        # clf.fit(mt_feats_to_red.T, labels)
        # mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm)
        cls = k_means.labels_
        cluster_labels.append(cls)
        # cluster_centers.append(means)

        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[cls == c, :]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        mid_features_temp = mid_feats_norm[cls == c2, :]
                        dist = distance.cdist(mt_feats_norm_temp,
                                              mid_features_temp)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent +
                                         clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)

        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    # cls = np.zeros((n_wins,))
    # for index in range(n_wins):
    #     j = np.argmin(np.abs(index - i_non_outliers))
    #     cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    if lda_dim <= 0:
        for index in range(1):
            # hmm training
            start_prob, transmat, means, cov = \
                train_hmm_compute_statistics(mt_feats_norm_or.T, cls)
            hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
            hmm.startprob_ = start_prob
            hmm.transmat_ = transmat
            hmm.means_ = means
            hmm.covars_ = cov
            cls = hmm.predict(mt_feats_norm_or)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 5)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    purity_cluster_m, purity_speaker_m = -1, -1
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) * mid_step +
                     mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls, purity_cluster_m, purity_speaker_m
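
# Hedged usage sketch for speaker_diarization (the wav path is an
# assumption): cls holds one cluster label per mid-term window (mid_step
# seconds apart, 0.1 by default); both purity scores stay at -1 unless a
# matching .segments ground-truth file exists next to the wav.
cls, purity_cluster, purity_speaker = speaker_diarization("meeting.wav",
                                                          n_speakers=2)
for window_index, label in enumerate(cls):
    print("{0:.1f}s: speaker{1:d}".format(window_index * 0.1, int(label)))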
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:    a sequence of segment's endpoints: segs[i] is the
                   endpoint of the i-th segment (in seconds)
        - classes: a sequence of class flags: class[i] is the class ID of
                   the i-th segment
    '''
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_name not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)

    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs

    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
        class_names_gt = class_names  # no ground truth: fall back to model classes
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:    a sequence of segment's endpoints: segs[i] is the
                   endpoint of the i-th segment (in seconds)
        - classes: a sequence of class flags: class[i] is the class ID of
                   the i-th segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mid_term_file_classification: input model_name not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return labels, class_names, accuracy, cm

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature vector
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    segs[-1] = len(signal) / float(sampling_rate)

    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)

    return labels, class_names, accuracy, cm
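
# Hedged usage sketch for mid_term_file_classification (both paths are
# assumptions): with an empty gt_file, accuracy and the confusion matrix
# keep their defaults and only the per-window labels are meaningful.
labels, class_names, accuracy, cm = \
    mid_term_file_classification("radio.wav", "data/svm_rbf_sm", "svm_rbf")
print([class_names[int(label)] for label in labels])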
from google.cloud.speech import enums
from google.cloud.speech import types

classes = ['speech', 'music']

wf = wave.open('./data/01_Radioaufnahmen_Musik_Jingle_Sprache_mono2.wav', 'rb')
print('sample rate: ' + str(wf.getframerate() / 1000.0) + 'kHz')
print('channels: ' + str(wf.getnchannels()))

print('initialize pocketsphinx...')
sd = SpeechDetector.SpeechDetector()
print('... done.')

print('initialize speech/music classifier...')
[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
 computeBEAT] = aT.load_model("./data/speech_music_classifier/svmSM")
modelType = "svm"
print('... done.')

print('initialize genre classifier...')
genre_recognizer = GenreRecognizer.GenreRecognizer(
    './data/genre_classifier/model.yaml',
    './data/genre_classifier/weights.h5')
print('... done.')

p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)

import time
def vadFolderWrapperMergedByTh(inputFolder, outFolder, smoothingWindow,
                               weight, model_name, threshold):
    if not os.path.isfile(model_name):
        print("vadFolderWrapperMergedByTh: input model_name not found!")
        return

    classifier, mean, std, classes, mid_window, mid_step, short_window, \
        short_step, compute_beat = aT.load_model(model_name)

    types = ('*.wav', '*.mp3')
    wavFilesList = []
    for files in types:
        print(inputFolder + files)
        wavFilesList.extend(glob.glob((inputFolder + files)))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return

    for wavFile in wavFilesList:
        if not os.path.isfile(wavFile):
            raise Exception("Input audio file not found!")

        base = os.path.splitext(os.path.basename(wavFile))[0]
        folder = outFolder + base + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        segfile = open(os.path.join(folder, 'segments'), 'w+')
        segfile2 = open(os.path.join(folder, 'segments_details'), 'w+')
        stack = deque()

        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05,
                                           smoothingWindow, weight, False)
        merge = True
        for i, st in enumerate(segmentLimits):
            signal = audioBasicIO.stereo_to_mono(
                x[int(fs * st[0]):int(fs * st[1])])
            if fs == 0:
                continue  # audio file IO problem
            if signal.shape[0] / float(fs) < mid_window:
                mid_window = signal.shape[0] / float(fs)

            # feature extraction:
            mid_features, s, _ = \
                aF.mid_feature_extraction(signal, fs, mid_window * fs,
                                          mid_step * fs,
                                          round(fs * short_window),
                                          round(fs * short_step))
            # long term averaging of mid-term statistics
            mid_features = mid_features.mean(axis=1)
            if compute_beat:
                beat, beat_conf = aF.beat_extraction(s, short_step)
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            feature_vector = (mid_features - mean) / std  # normalization
            class_id = classifier.predict(feature_vector.reshape(1, -1))[0]
            label = classes[int(class_id)]

            if label == 'speech':
                if merge:
                    seg_prev = []
                    if len(stack) > 0:
                        seg_prev = stack.pop()
                    if len(seg_prev) > 0 and st[1] - seg_prev[0] > threshold:
                        # merged span would exceed threshold: start new segment
                        seg = [st[0], st[1], label]
                        stack.append(seg_prev)
                        stack.append(seg)
                        merge = True
                    elif len(seg_prev) > 0:
                        # extend the previous speech segment
                        seg = [seg_prev[0], st[1], label]
                        stack.append(seg)
                        merge = True
                    else:
                        seg = [st[0], st[1], label]
                        stack.append(seg)
                        merge = True
                else:
                    seg = [st[0], st[1], label]
                    stack.append(seg)
                    merge = True
            else:
                merge = False

        for sn in stack:
            strName = base + "_" + "{:.3f}".format(sn[0]) + "_" + \
                "{:.3f}".format(sn[1])
            if sn[2] == 'speech':
                strOut = folder + strName + ".wav"
                wavfile.write(strOut, fs, x[int(fs * sn[0]):int(fs * sn[1])])
                segfile.write(strName + ' ' + base + ' ' +
                              "{:.3f}".format(sn[0]) + ' ' +
                              "{:.3f}".format(sn[1]) + "\n")
                segfile2.write(strName + ' ' + "{:.3f}".format(sn[0]) + ' ' +
                               "{:.3f}".format(sn[1]) + ' ' + sn[2] + "\n")
        segfile.close()
        segfile2.close()
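
# Hedged usage sketch for vadFolderWrapperMergedByTh (all argument values
# are assumptions): consecutive speech segments are merged as long as the
# merged span stays under `threshold` seconds, and each merged segment is
# written out as a wav plus Kaldi-style 'segments' entries.
vadFolderWrapperMergedByTh("recordings/", "vad_out/", smoothingWindow=0.5,
                           weight=0.3, model_name="data/svm_rbf_sm",
                           threshold=10.0)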
def FileClassification(input_file, model_name, model_type,
                       gt=False, gt_file=""):
    '''
    TODO: This function needs to be refactored according to the code in
    audioSegmentation.mid_term_file_classification()
    '''
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_name not found!")
        return (-1, -1, -1, -1)

    # Load classifier with load_model:
    [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
     compute_beat] = aT.load_model(model_name)

    # Using audioBasicIO from pyAudioAnalysis, the input audio stream is
    # loaded
    [fs, x] = audioBasicIO.read_audio_file(input_file)
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs

    # mid-term feature extraction using pyAudioAnalysis
    # mid_feature_extraction:
    [mt_feats, _, _] = mF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []

    # per-class posterior thresholds; below them a segment is labeled 'None'
    prob_thresholds = {0.0: 0.5, 1.0: 0.9, 2.0: 0.6, 3.0: 0.3, 4.0: 0.3}

    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        [res, P] = aT.classifier_wrapper(classifier, model_type, cur_fv)
        if res in prob_thresholds:
            if numpy.max(P) > prob_thresholds[res]:
                flags_ind.append(res)
                # update class label matrix
                flags.append(class_names[int(res)])
                Ps.append(numpy.max(P))  # update probability matrix
            else:
                flags_ind.append(-1)
                flags.append('None')
                Ps.append(-1)
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]

    # convert fix-sized flags to segments and classes
    (segs, classes) = aS.labels_to_segments(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # fallback when no ground truth is given (fixes a NameError in the
    # original when gt is False and acc >= 0)
    class_names_gt = class_names

    if gt == True:
        # Load ground-truth:
        if os.path.isfile(gt_file):
            [seg_start_gt, seg_end_gt, seg_l_gt] = \
                aS.read_segmentation_gt(gt_file)
            flags_gt, class_names_gt = aS.segments_to_labels(seg_start_gt,
                                                             seg_end_gt,
                                                             seg_l_gt,
                                                             mt_step)
            flags_ind_gt = []
            for j, fl in enumerate(flags_gt):
                # "align" labels with GT
                if class_names_gt[flags_gt[j]] in class_names:
                    flags_ind_gt.append(
                        class_names.index(class_names_gt[flags_gt[j]]))
                else:
                    flags_ind_gt.append(-1)
            flags_ind_gt = numpy.array(flags_ind_gt)
            cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
            for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
                cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
        else:
            cm = []
            flags_ind_gt = numpy.array([])
        acc = aS.plot_segmentation_results(flags_ind, flags_ind_gt,
                                           class_names, mt_step, False)
    else:
        cm = []
        flags_ind_gt = numpy.array([])
        acc = aS.plot_segmentation_results(flags_ind, flags_ind_gt,
                                           class_names, mt_step, False)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
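
# Hedged usage sketch for FileClassification (both paths are assumptions):
# with gt=True and a .segments ground-truth file, the confusion matrix and
# accuracy are computed against the aligned ground-truth labels.
flags, names, acc, cm = FileClassification("show.wav", "data/svm_5class",
                                           "svm", gt=True,
                                           gt_file="show.segments")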
def get_coordinates_from_audio(block, rms_min_max=[0, 25000]):
    mid_buf = []
    global all_data
    global outstr
    all_data = []
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")

    # load segment model
    [classifier, mu, std, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")
    [clf_energy, mu_energy, std_energy, class_names_energy, mt_win_en,
     mt_step_en, st_win_en, st_step_en, _] = aT.load_model("energy")
    [clf_valence, mu_valence, std_valence, class_names_valence, mt_win_va,
     mt_step_va, st_win_va, st_step_va, _] = aT.load_model("valence")

    count_b = len(block) // 2
    format_h = "%dh" % (count_b)
    shorts = struct.unpack(format_h, block)
    cur_win = list(shorts)
    mid_buf = mid_buf + cur_win
    del cur_win

    # data-driven time
    x = numpy.int16(mid_buf)
    seg_len = len(x)
    r = audioop.rms(x, 2)
    if r < rms_min_max[0]:
        # set new min in case the default value is exceeded
        rms_min_max[0] = r
    if r > rms_min_max[1]:
        # set new max in case the default value is exceeded
        rms_min_max[1] = r
    r_norm = float(r - rms_min_max[0]) / float(rms_min_max[1] -
                                               rms_min_max[0])
    r_map = int(r_norm * 255)
    print(f'RMS: {r}; MIN: {rms_min_max[0]}; MAX: {rms_min_max[1]}; '
          f'NORM: {r_norm}; MAP: {r_map}')

    # extract features
    # We are using the signal length as mid term window and step,
    # in order to guarantee a mid-term feature sequence of len 1
    [mt_f, _, _] = mF(x, fs, seg_len, seg_len, round(fs * st_win),
                      round(fs * st_step))
    fv = (mt_f[:, 0] - mu) / std

    # classify vector:
    [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", fv)
    win_class = class_names[int(res)]
    if prob[class_names.index("silence")] > 0.8:
        soft_valence = 0
        soft_energy = 0
        print("Silence")
    else:
        # extract features for music mood
        [f_2, _, _] = mF(x, fs, round(fs * mt_win_en),
                         round(fs * mt_step_en), round(fs * st_win_en),
                         round(fs * st_step_en))
        [f_3, _, _] = mF(x, fs, round(fs * mt_win_va),
                         round(fs * mt_step_va), round(fs * st_win_va),
                         round(fs * st_step_va))
        # normalize feature vector
        fv_2 = (f_2[:, 0] - mu_energy) / std_energy
        fv_3 = (f_3[:, 0] - mu_valence) / std_valence
        [res_energy, p_en] = aT.classifier_wrapper(clf_energy, "svm_rbf",
                                                   fv_2)
        win_class_energy = class_names_energy[int(res_energy)]
        [res_valence, p_val] = aT.classifier_wrapper(clf_valence, "svm_rbf",
                                                     fv_3)
        win_class_valence = class_names_valence[int(res_valence)]
        soft_energy = p_en[class_names_energy.index("high")] - \
            p_en[class_names_energy.index("low")]
        soft_valence = p_val[class_names_valence.index("positive")] - \
            p_val[class_names_valence.index("negative")]
        print(win_class, win_class_energy, win_class_valence,
              soft_valence, soft_energy)

    global prev_valence_and_energy
    if prev_valence_and_energy is None:
        prev_valence_and_energy = (soft_valence, soft_energy)
    valence_difference = abs(prev_valence_and_energy[0] - soft_valence)
    energy_difference = abs(prev_valence_and_energy[1] - soft_energy)
    bound = 0.2
    should_change = valence_difference > bound or energy_difference > bound
    all_data += mid_buf
    mid_buf = []

    h, w, _ = img.shape
    y_center, x_center = int(h / 2), int(w / 2)
    x = x_center + int((w / 2) * soft_valence if not should_change
                       else prev_valence_and_energy[0])
    y = y_center - int((h / 2) * soft_energy if not should_change
                       else prev_valence_and_energy[1])
    if should_change:
        prev_valence_and_energy = (soft_valence, soft_energy)
    radius = 20
    alpha = format(r_map, '02x')
    return [soft_valence, soft_energy, alpha]
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - labels:              a sequence of class labels (one per
                               mid-term window)
        - class_names:         the names of the model classes
        - mid_step:            the mid-term step used (in seconds)
        - class_probabilities: the classifier posteriors per window
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        return labels, class_names, accuracy, cm

    classifier, mean, std, class_names, mt_win, mid_step, st_win, \
        st_step, compute_beat = at.load_model(model_name)

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    class_probabilities = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature vector
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, prob = at.classifier_wrapper(classifier, model_type,
                                                      feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        class_probabilities.append(prob)
    labels = np.array(labels)

    return labels, class_names, mid_step, class_probabilities
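
# Hedged usage sketch for this variant (both paths are assumptions): unlike
# the earlier version it returns the mid-term step instead of accuracy, so
# window indices can be turned into timestamps directly.
labels, class_names, mid_step, probs = \
    mid_term_file_classification("radio.wav", "data/svm_rbf_sm", "svm_rbf")
for i, label in enumerate(labels):
    print("{0:.1f}s: {1:s}".format(i * mid_step, class_names[int(label)]))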
def record_audio(block_size, fs=8000, show_spec=False, show_chroma=False,
                 log_sounds=False, logs_all=False):
    # initialize recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT, channels=1, rate=fs, input=True,
                     frames_per_buffer=mid_buf_size)
    mid_buf = []
    count = 0
    global all_data
    global outstr
    all_data = []

    # initialize counters etc.
    time_start = time.time()
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")
    out_folder = outstr + "_segments"
    if log_sounds:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

    # load segment model
    [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")

    while 1:
        try:
            block = stream.read(mid_buf_size)
            count_b = len(block) // 2
            format_h = "%dh" % (count_b)
            shorts = struct.unpack(format_h, block)
            cur_win = list(shorts)
            mid_buf = mid_buf + cur_win
            del cur_win

            # time since recording started:
            e_time = (time.time() - time_start)
            # data-driven time
            data_time = (count + 1) * block_size
            x = numpy.int16(mid_buf)
            seg_len = len(x)

            # extract features
            # We are using the signal length as mid term window and step,
            # in order to guarantee a mid-term feature sequence of len 1
            [mt_feats, _, _] = mF.mid_feature_extraction(x, fs, seg_len,
                                                         seg_len,
                                                         round(fs * st_win),
                                                         round(fs * st_step))
            cur_fv = (mt_feats[:, 0] - MEAN) / STD

            # classify vector:
            [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf",
                                                cur_fv)
            win_class = class_names[int(res)]
            win_prob = prob[int(res)]

            if logs_all:
                all_data += mid_buf
            mid_buf = numpy.double(mid_buf)

            # Compute spectrogram
            if show_spec:
                (spec, t_axis, freq_axis_s) = sF.spectrogram(mid_buf, fs,
                                                             0.050 * fs,
                                                             0.050 * fs)
                freq_axis_s = numpy.array(freq_axis_s)  # frequency axis
                # most dominant frequencies (for each short-term window):
                dominant_freqs = freq_axis_s[numpy.argmax(spec, axis=1)]
                # get average most dominant freq
                max_freq = numpy.mean(dominant_freqs)
                max_freq_std = numpy.std(dominant_freqs)

            # Compute chromagram
            if show_chroma:
                (chrom, TimeAxisC, freq_axis_c) = sF.chromagram(mid_buf, fs,
                                                                0.050 * fs,
                                                                0.050 * fs)
                freq_axis_c = numpy.array(freq_axis_c)
                # most dominant chroma classes:
                dominant_freqs_c = freq_axis_c[numpy.argmax(chrom, axis=1)]
                # get most common among all short-term windows
                max_freqC = most_common(dominant_freqs_c)[0]

            # Plot signal window
            signalPlotCV = plotCV(scipy.signal.resample(mid_buf + 16000,
                                                        plot_w),
                                  plot_w, plot_h, 32000)
            cv2.imshow('Signal', signalPlotCV)
            cv2.moveWindow('Signal', 50, status_h + 50)

            # Show spectrogram
            if show_spec:
                i_spec = numpy.array(spec.T * 255, dtype=numpy.uint8)
                i_spec2 = cv2.resize(i_spec, (plot_w, plot_h),
                                     interpolation=cv2.INTER_CUBIC)
                i_spec2 = cv2.applyColorMap(i_spec2, cv2.COLORMAP_JET)
                cv2.putText(i_spec2, "max_freq: %.0f Hz" % max_freq, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Spectrogram', i_spec2)
                cv2.moveWindow('Spectrogram', 50, plot_h + status_h + 60)

            # Show chromagram
            if show_chroma:
                i_chroma = numpy.array((chrom.T / chrom.max()) * 255,
                                       dtype=numpy.uint8)
                i_chroma2 = cv2.resize(i_chroma, (plot_w, plot_h),
                                       interpolation=cv2.INTER_CUBIC)
                i_chroma2 = cv2.applyColorMap(i_chroma2, cv2.COLORMAP_JET)
                cv2.putText(i_chroma2, "max_freqC: %s" % max_freqC, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Chroma', i_chroma2)
                cv2.moveWindow('Chroma', 50, 2 * plot_h + status_h + 60)

            # Activity Detection:
            print("{0:.2f}\t{1:s}\t{2:.2f}".format(e_time, win_class,
                                                   win_prob))

            if log_sounds:
                # TODO: log audio files
                out_file = os.path.join(out_folder,
                                        "{0:.2f}_".format(e_time).zfill(8) +
                                        win_class + ".wav")
                # shutil.copyfile("temp.wav", out_file)
                wavfile.write(out_file, fs, x)

            textIm = numpy.zeros((status_h, plot_w, 3))
            statusStrTime = "time: %.1f sec" % e_time + \
                            " - data time: %.1f sec" % data_time + \
                            " - loss : %.1f sec" % (e_time - data_time)
            cv2.putText(textIm, statusStrTime, (0, 11),
                        cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
            cv2.putText(textIm, win_class, (0, 33),
                        cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 255))
            cv2.imshow("Status", textIm)
            cv2.moveWindow("Status", 50, 0)
            mid_buf = []
            ch = cv2.waitKey(10)
            count += 1
        except IOError:
            print("Error recording")
def emotion_from_speech(Fs, x, log,
                        model_name="pyAudioAnalysis/pyAudioAnalysis/data/svmSpeechEmotion",
                        model_type="svm"):
    """
    :param Fs: frame rate
    :param x: raw audio data (16-bit PCM bytes)
    :param log: logger instance
    :param model_name: base path of the regression models
    :param model_type: regression model type (svm, svm_rbf or randomforest)
    :return: dict with "valence" and "arousal" values in [-1, 1]
    """
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    emotion = {"valence": None, "arousal": None}

    # Feature extraction
    x = np.frombuffer(x, np.int16)  # np.fromstring is deprecated
    if model_type == 'svm' or model_type == "svm_rbf" \
            or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(regression_models[0], True)
    else:
        return emotion
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)

    # Regression
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("emotion_from_speech: input model_name not found!")
            return emotion
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step,
             compute_beat] = aT.load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(aT.regressionWrapper(model, model_type, curFV))

    # clamp regression outputs to [-1, 1]
    if R[0] > 1:
        log.warning("Valence > 1")
        emotion["valence"] = 1
    elif R[0] < -1:
        log.warning("Valence < -1")
        emotion["valence"] = -1
    else:
        emotion["valence"] = R[0]
    if R[1] > 1:
        log.warning("Arousal > 1")
        emotion["arousal"] = 1
    elif R[1] < -1:
        log.warning("Arousal < -1")
        emotion["arousal"] = -1
    else:
        emotion["arousal"] = R[1]
    return emotion
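
# Hedged usage sketch for emotion_from_speech (the wav path is an
# assumption): the raw frames are passed as bytes, which is exactly what
# np.frombuffer expects inside the function.
import logging
import wave

wf = wave.open("speech.wav", "rb")
frames = wf.readframes(wf.getnframes())
emotion = emotion_from_speech(wf.getframerate(), frames,
                              logging.getLogger(__name__))
print(emotion)  # {'valence': ..., 'arousal': ...}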
def record_audio(block_size, devices, use_yeelight_bulbs=False, fs=8000):
    # initialize the yeelight devices:
    bulbs = []
    if use_yeelight_bulbs:
        for d in devices:
            bulbs.append(Bulb(d))
            try:
                bulbs[-1].turn_on()
            except Exception:
                bulbs = []

    # initialize recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT, channels=1, rate=fs, input=True,
                     frames_per_buffer=mid_buf_size)
    mid_buf = []
    count = 0
    global all_data
    global outstr
    all_data = []
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")

    # load segment model
    [classifier, mu, std, class_names, mt_win, mt_step, st_win, st_step,
     _] = aT.load_model("model")
    [clf_energy, mu_energy, std_energy, class_names_energy, mt_win_en,
     mt_step_en, st_win_en, st_step_en, _] = aT.load_model("energy")
    [clf_valence, mu_valence, std_valence, class_names_valence, mt_win_va,
     mt_step_va, st_win_va, st_step_va, _] = aT.load_model("valence")

    while 1:
        block = stream.read(mid_buf_size)
        count_b = len(block) // 2
        format_h = "%dh" % (count_b)
        shorts = struct.unpack(format_h, block)
        cur_win = list(shorts)
        mid_buf = mid_buf + cur_win
        del cur_win
        if len(mid_buf) >= 5 * fs:
            # data-driven time
            x = numpy.int16(mid_buf)
            seg_len = len(x)

            # extract features
            # We are using the signal length as mid term window and step,
            # in order to guarantee a mid-term feature sequence of len 1
            [mt_f, _, _] = mF(x, fs, seg_len, seg_len, round(fs * st_win),
                              round(fs * st_step))
            fv = (mt_f[:, 0] - mu) / std

            # classify vector:
            [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", fv)
            win_class = class_names[int(res)]
            if prob[class_names.index("silence")] > 0.8:
                soft_valence = 0
                soft_energy = 0
                print("Silence")
            else:
                # extract features for music mood
                [f_2, _, _] = mF(x, fs, round(fs * mt_win_en),
                                 round(fs * mt_step_en),
                                 round(fs * st_win_en),
                                 round(fs * st_step_en))
                [f_3, _, _] = mF(x, fs, round(fs * mt_win_va),
                                 round(fs * mt_step_va),
                                 round(fs * st_win_va),
                                 round(fs * st_step_va))
                # normalize feature vector
                fv_2 = (f_2[:, 0] - mu_energy) / std_energy
                fv_3 = (f_3[:, 0] - mu_valence) / std_valence
                [res_energy, p_en] = aT.classifier_wrapper(clf_energy,
                                                           "svm_rbf", fv_2)
                win_class_energy = class_names_energy[int(res_energy)]
                [res_valence, p_val] = aT.classifier_wrapper(clf_valence,
                                                             "svm_rbf", fv_3)
                win_class_valence = class_names_valence[int(res_valence)]
                soft_energy = p_en[class_names_energy.index("high")] - \
                    p_en[class_names_energy.index("low")]
                soft_valence = p_val[class_names_valence.index("positive")] - \
                    p_val[class_names_valence.index("negative")]
                print(win_class, win_class_energy, win_class_valence,
                      soft_valence, soft_energy)

            all_data += mid_buf
            mid_buf = []

            h, w, _ = img.shape
            y_center, x_center = int(h / 2), int(w / 2)
            x = x_center + int((w / 2) * soft_valence)
            y = y_center - int((h / 2) * soft_energy)
            radius = 20
            emo_map_img_2 = emo_map_img.copy()
            color = numpy.median(emo_map[y - 2:y + 2, x - 2:x + 2],
                                 axis=0).mean(axis=0)
            emo_map_img_2 = cv2.circle(emo_map_img_2, (x, y), radius,
                                       (int(color[0]), int(color[1]),
                                        int(color[2])), -1)
            emo_map_img_2 = cv2.circle(emo_map_img_2, (x, y), radius,
                                       (255, 255, 255), 2)
            cv2.imshow('Emotion Color Map', emo_map_img_2)

            # set yeelight bulb colors
            if use_yeelight_bulbs:
                for b in bulbs:
                    if b:
                        # attention: color is in bgr so we need to invert:
                        b.set_rgb(int(color[2]), int(color[1]),
                                  int(color[0]))
            cv2.waitKey(10)
            count += 1
def mtFileClassification(inputFile, modelName, modelType, plotResults=False,
                         gtFile="", return_for_user=False):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:    path of the input WAV file
        - modelName:    name of the classification model
        - modelType:    svm or knn depending on the classifier type
        - plotResults:  True if results are to be plotted using
                        matplotlib along with a set of statistics
    RETURNS:
        - segs:    a sequence of segment's endpoints: segs[i] is the
                   endpoint of the i-th segment (in seconds)
        - classes: a sequence of class flags: class[i] is the class ID of
                   the i-th segment
    '''
    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelName not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if modelType == "knn":
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.load_model_knn(modelName)
    else:
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.load_model(modelName)
    if computeBEAT:
        print("Model " + modelName + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    Duration = len(x) / Fs

    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs,
                                                  mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(MidTermFeatures.shape[1]):
        # normalize current feature vector
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        # classify vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
            flags[i] = flags[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mtStep)
    segs[-1, 1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT,
                                           segLabelsGT, mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):
            # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = numpy.array(flagsIndGT)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])
        classNamesGT = classNames  # no ground truth: fall back to model classes
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep,
                                  not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        if return_for_user:
            return {'segments': segs, 'classes': classes, 'accuracy': acc}
        else:
            return (flagsInd, classNamesGT, acc, CM)
    else:
        if return_for_user:
            return {'segments': segs, 'classes': classes}
        else:
            return (flagsInd, classNames, acc, CM)
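
# Hedged usage sketch for mtFileClassification (both paths are assumptions):
# with return_for_user=True the result is a dict of segment boundaries and
# class labels rather than the raw (flags, names, acc, CM) tuple.
result = mtFileClassification("show.wav", "data/svmSM", "svm",
                              return_for_user=True)
for (seg_start, seg_end), seg_class in zip(result['segments'],
                                           result['classes']):
    print("{0:.1f}-{1:.1f}s: {2:s}".format(seg_start, seg_end, seg_class))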