Пример #1
0
def classify():
    """Classify every evaluation recording and write the results to a file.

    Loads the trained Keras model from ``cnn_speech_trained_model.h5``,
    converts the data under ``EVAL_DATA`` with ``wav16khz2mfcc`` and runs
    ``classify_record`` on each item.  One line per item, formatted as
    ``<name> <score> <class>``, is written in sorted order to
    ``audio_convolutionalNN.txt``.  When ``PRINT_STATS`` is truthy, the number
    of items whose class is truthy is printed.

    If the model cannot be loaded, an error message is printed and the
    process terminates through ``sys.exit()``.
    """
    # load model
    try:
        model = keras.models.load_model('cnn_speech_trained_model.h5')
    except Exception:
        # `Exception` instead of a bare `except`, so that KeyboardInterrupt
        # and SystemExit are not reported as a missing model file.
        print('Error: No model file found, please train the classifier first.')
        sys.exit()

    # load the evaluation data
    eval_data = list(wav16khz2mfcc(EVAL_DATA).items())

    # classify the given data
    count = 0
    results = []
    for filename, data in eval_data:
        # compute the class and score of the recording
        cls, score = classify_record(model, data)

        # keep only the file name without directory and extension
        name = filename.split('/')[-1].split('.')[0]
        results.append(' '.join((name, str(score), str(cls))))

        if cls:
            count += 1  # remember that a target was found

    results.sort()

    # the context manager closes the file even if a write fails
    with open('audio_convolutionalNN.txt', 'w') as output:
        for result in results:
            print(result, file=output)

    if PRINT_STATS:
        print(f'Targets found: {count}')
def loadEvalData():
    """Load the evaluation data found under ``EVAL_DIR``.

    A progress message is printed before and after the load.

    Returns:
        The value returned by ``wav16khz2mfcc(EVAL_DIR)``.
    """
    print("STEP4: Loading evaluation data.")

    loaded = wav16khz2mfcc(EVAL_DIR)

    done_message = (
        "STEP4 Done: All evaluation data loaded and prepared for classification.\n"
    )
    print(done_message)

    return loaded
def loadTrainData():
    """Load the training data of every class found under ``TRAIN_DIR``.

    Each entry of ``TRAIN_DIR`` is treated as one class: its data is loaded
    with ``wav16khz2mfcc`` and all arrays of that class are stacked
    vertically into a single array.  The result is stored in the module-level
    ``trainClasses`` mapping under the entry name.

    Returns:
        The ``trainClasses`` mapping (entry name -> stacked array).
    """
    filenames = os.listdir(TRAIN_DIR)

    # Recordings loading
    print("STEP1: Loading training data into internal structure")

    for filename in filenames:
        classRecords = wav16khz2mfcc(TRAIN_DIR + "/" + filename).values()
        # np.vstack needs a real sequence; a dict view is rejected by
        # current NumPy releases, hence the list().
        trainClasses[filename] = np.vstack(list(classRecords))

    print("STEP1 Done: Training data loaded. Number of trainig classes: ",
          len(trainClasses), "\n")

    return trainClasses
Пример #4
0
def main():
    """Score every item loaded from ``eval`` with a pickled pair of GMMs.

    The model file defaults to ``GMM_model.pkl`` and may be overridden by the
    first command-line argument.  It must unpickle to six values: weights,
    means and covariances of the target model followed by those of the
    non-target model.  For every item, in sorted key order, one line is
    printed: the base name, the score and the decision (1 when the score is
    non-negative, otherwise 0).
    """
    check_dir('eval')
    test = wav16khz2mfcc('eval')

    # a-priori probabilities of the target / non-target class
    prior_t = 0.5
    prior_nt = 1 - prior_t

    # choose one model: CLI argument if given, default file otherwise
    model_path = sys.argv[1] if len(sys.argv) > 1 else 'GMM_model.pkl'

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at model files you trust.
    with open(model_path, 'rb') as model_file:
        params = pickle.load(model_file)
    Ws_t, MUs_t, COVs_t, Ws_nt, MUs_nt, COVs_nt = params

    for path in sorted(test):
        features = test[path]
        ll_t = logpdf_gmm(features, Ws_t, MUs_t, COVs_t)
        ll_nt = logpdf_gmm(features, Ws_nt, MUs_nt, COVs_nt)
        scr = (sum(ll_t) + np.log(prior_t)) - (sum(ll_nt) + np.log(prior_nt))

        # strip directory and extension from the key
        name = path.split("/")[-1].split(".")[0]
        decision = 1 if scr >= 0 else 0
        print(name, scr, decision)
Пример #5
0
    # Fragment of a loop body whose header is outside this excerpt; `k`,
    # `i_test_x`, `img_model`, `data` and `output_file` are defined earlier.
    predictions = img_model.predict_proba(i_test_x)
    # Index of the largest value in the flattened predictions; `p` is not
    # used in the visible lines.
    p = np.argmax(predictions)
    # Capture group 1 of the pattern applied to `k` becomes the record name.
    # NOTE(review): `\w` sits in a non-raw string literal and the dot before
    # "png" is unescaped -- consider a raw-string pattern with an escaped dot.
    name = re.search('.*/(\w*).png', k).group(1)
    # Get the result (second value of the first prediction row, times 10,
    # rounded to two decimals) and write it to the output file.
    result = predictions[0][1] * 10
    result = float("{:.2f}".format(result))
    # Line format: `k`, two spaces, the result, then the decision
    # (1 when result >= 1.0, otherwise 0).
    if result >= 1.0:
        output_file.write(k + " " + " " + str(result) + " " + "1\n")
    else:
        output_file.write(k + " " + " " + str(result) + " " + "0\n")
    # Add the unscaled prediction value to the data dict for detecting person.
    data[name] = [predictions[0][1]]

output_file.close()

# Load the voice data from `data_folder_path`; the result is iterated below
# as a mapping through .items().
v_data = wav16khz2mfcc(data_folder_path)
voice_model.summary()

# NOTE(review): opened without a context manager; the matching close() is not
# visible in this excerpt.
output_file = open('./predict_voice_output.txt', 'w')

print("[INFO] evaluating voice classifier...")
a = []
count = 0
# Presumably the average value of results (per the original comment); the
# use of this constant is outside the excerpt -- TODO confirm.
mean = 1020
for k, v in v_data.items():
    # Prepare data for the model: stack the items of `v` vertically.
    v_test_x = np.vstack(tuple(v))
    # np.r_ is given a single operand here.
    v_test_x = np.r_[v_test_x]
    v_test_x = v_test_x.astype('float32')
    # In-place division by 255.
    # NOTE(review): 255 looks like an image-style scale factor -- confirm it
    # is intended for these features.
    v_test_x /= 255
Пример #6
0
def main():
    """Train and evaluate two-class Gaussian / GMM models, then save the GMMs.

    Steps, in order:

    1. Check the four data directories with ``check_dir`` and load the
       positive ("m") and negative ("f") train/test data with
       ``wav16khz2mfcc``.
    2. Compute a two-column PCA basis and a one-column LDA direction from the
       stacked training data.
    3. Plot posteriors and log-likelihoods of single test items under the
       models returned by ``train_gauss``.
    4. Score the positive test items with those models on raw and on
       LDA-projected features (these scores are computed but not reported).
    5. Initialise one GMM per class (12 and 7 components), call ``train_gmm``
       100 times for each, print the fraction of correctly classified
       positive and negative test items, and pickle the six GMM parameter
       objects to ``GMM_model.pkl``.
    """
    check_dir(os.path.dirname(negative_test_path))
    check_dir(os.path.dirname(negative_train_path))
    check_dir(os.path.dirname(positive_test_path))
    check_dir(os.path.dirname(positive_train_path))

    train_m = list(wav16khz2mfcc(positive_train_path).values())
    train_f = list(wav16khz2mfcc(negative_train_path).values())
    test_m = list(wav16khz2mfcc(positive_test_path).values())
    test_f = list(wav16khz2mfcc(negative_test_path).values())

    # One row per frame, all training items of a class stacked together
    train_m = np.vstack(train_m)
    train_f = np.vstack(train_f)
    dim = train_m.shape[1]

    # PCA: eigenvectors of the total covariance for the two largest
    # eigenvalues.  `subset_by_index` replaces the `eigvals` keyword, which
    # was deprecated and later removed from scipy.linalg.eigh.
    cov_tot = np.cov(np.vstack([train_m, train_f]).T, bias=True)
    d, e = scipy.linalg.eigh(cov_tot, subset_by_index=(dim - 2, dim - 1))

    train_m_pca = train_m.dot(e)
    train_f_pca = train_f.dot(e)
    # Classes are not well separated in 2D PCA subspace

    # LDA: generalized eigenproblem of across-class vs. within-class
    # covariance; only the eigenvector of the largest eigenvalue is kept.
    n_m = len(train_m)
    n_f = len(train_f)
    cov_wc = (n_m * np.cov(train_m.T, bias=True) +
              n_f * np.cov(train_f.T, bias=True)) / (n_m + n_f)
    cov_ac = cov_tot - cov_wc
    d, e = scipy.linalg.eigh(cov_ac, cov_wc,
                             subset_by_index=(dim - 1, dim - 1))

    # Lets define uniform a-priori probabilities of classes:
    P_m = 0.5
    P_f = 1 - P_m

    # Per-dimension variances as the second model parameter
    ll_m = logpdf_gauss(test_m[0], np.mean(train_m, axis=0),
                        np.var(train_m, axis=0))
    ll_f = logpdf_gauss(test_m[0], np.mean(train_f, axis=0),
                        np.var(train_f, axis=0))

    posterior_m = np.exp(ll_m) * P_m / (np.exp(ll_m) * P_m +
                                        np.exp(ll_f) * P_f)

    # Same test item, now with the parameters returned by train_gauss.
    # '*' before 'train_gauss' passes both return values (mean and cov) as
    # parameters of 'logpdf_gauss'.
    ll_m = logpdf_gauss(test_m[0], *train_gauss(train_m))
    ll_f = logpdf_gauss(test_m[0], *train_gauss(train_f))
    posterior_m = np.exp(ll_m) * P_m / (np.exp(ll_m) * P_m +
                                        np.exp(ll_f) * P_f)
    plt.figure()
    plt.plot(posterior_m, 'b')
    plt.plot(1 - posterior_m, 'r')
    plt.figure()
    plt.plot(ll_m, 'b')
    plt.plot(ll_f, 'r')

    # Again gaussian models with full covariance matrices. Now testing a
    # female utterance
    ll_m = logpdf_gauss(test_f[1], *train_gauss(train_m))
    ll_f = logpdf_gauss(test_f[1], *train_gauss(train_f))
    posterior_m = np.exp(ll_m) * P_m / (np.exp(ll_m) * P_m +
                                        np.exp(ll_f) * P_f)
    plt.figure()
    plt.plot(posterior_m, 'b')
    plt.plot(1 - posterior_m, 'r')
    plt.figure()
    plt.plot(ll_m, 'b')
    plt.plot(ll_f, 'r')

    # Log-likelihood-ratio score of every positive test item on raw features
    score = []
    mean_m, cov_m = train_gauss(train_m)
    mean_f, cov_f = train_gauss(train_f)
    for tst in test_m:
        ll_m = logpdf_gauss(tst, mean_m, cov_m)
        ll_f = logpdf_gauss(tst, mean_f, cov_f)
        score.append((sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f)))

    # Run recognition with 1-dimensional LDA projected data
    score = []
    mean_m, cov_m = train_gauss(train_m.dot(e))
    mean_f, cov_f = train_gauss(train_f.dot(e))
    for tst in test_m:
        ll_m = logpdf_gauss(tst.dot(e), mean_m, np.atleast_2d(cov_m))
        ll_f = logpdf_gauss(tst.dot(e), mean_f, np.atleast_2d(cov_f))
        score.append((sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f)))

    # GMM initialisation: means are randomly picked training rows, every
    # component starts from the overall per-dimension variance and equal
    # weight.
    M_m = 12

    MUs_m = train_m[randint(1, len(train_m), M_m)]
    COVs_m = [np.var(train_m, axis=0)] * M_m
    Ws_m = np.ones(M_m) / M_m

    M_f = 7
    MUs_f = train_f[randint(1, len(train_f), M_f)]
    COVs_f = [np.var(train_f, axis=0)] * M_f
    Ws_f = np.ones(M_f) / M_f

    # 100 training iterations for both models
    for jj in range(100):
        [Ws_m, MUs_m, COVs_m, TTL_m] = train_gmm(train_m, Ws_m, MUs_m, COVs_m)
        [Ws_f, MUs_f, COVs_f, TTL_f] = train_gmm(train_f, Ws_f, MUs_f, COVs_f)

    # Positive test items: correct when the score is non-negative
    score = []
    testok = 0
    testnok = 0
    for tst in test_m:
        ll_m = logpdf_gmm(tst, Ws_m, MUs_m, COVs_m)
        ll_f = logpdf_gmm(tst, Ws_f, MUs_f, COVs_f)
        scr = (sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f))
        score.append(scr)
        if scr >= 0:
            testok += 1
        else:
            testnok += 1
    print("target is " + str(testok / (testok + testnok)))

    # Negative test items: correct when the score is negative
    score = []
    testok = 0
    testnok = 0
    for tst in test_f:
        ll_m = logpdf_gmm(tst, Ws_m, MUs_m, COVs_m)
        ll_f = logpdf_gmm(tst, Ws_f, MUs_f, COVs_f)
        scr = (sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f))
        score.append(scr)
        if scr < 0:
            testok += 1
        else:
            testnok += 1
    print("non target is " + str(testok / (testok + testnok)))

    # Write the model first, so the confirmation is only printed on success
    with open('GMM_model.pkl', 'wb') as f:
        pickle.dump([Ws_m, MUs_m, COVs_m, Ws_f, MUs_f, COVs_f], f)
    print('Saved as "GMM_model.pkl"')
Пример #7
0
import numpy as np
from ikrlib import wav16khz2mfcc, logistic_sigmoid, train_gauss, train_gmm, logpdf_gmm, gellipse
import matplotlib.pyplot as plt
import scipy.linalg
from numpy.random import randint

# Two classes, loaded from data/train/1 and data/train/2
train_sample = [None] * 2
test_sample = [None] * 2
real_test_sample = [None] * 2

for class_index in range(len(train_sample)):
    # Drop the first 100 and the last 300 rows of every record.  A new list
    # has to be built: re-assigning the loop variable (as the previous code
    # did) leaves the stored records untouched.
    train_sample[class_index] = [
        rec[100:len(rec) - 300, :]
        for rec in wav16khz2mfcc('data/train/' + str(class_index + 1)).values()
    ]

# A-priori class probabilities: share of records belonging to class 1, so
# that both priors lie in [0, 1] and sum to one (the plain ratio n1 / n2 can
# exceed 1 and make the second prior negative).
P_c1 = len(train_sample[0]) * 1.0 / (len(train_sample[0]) +
                                     len(train_sample[1]))
P_c = [P_c1, 1 - P_c1]

real_test_sample_dict = wav16khz2mfcc('data/eval/')

real_test_name = real_test_sample_dict.keys()

# Cutting silence of a record - to get better results.  Values are visited
# in the same order as the keys above.
real_test_sample = [rec[100:len(rec) - 300, :]
                    for rec in real_test_sample_dict.values()]

for class_index in xrange(len(train_sample)):
Пример #8
0
from __future__ import division
import numpy as np
from ikrlib import wav16khz2mfcc, train_gmm, logpdf_gmm

# Number of classes; class data is read from train/1 .. train/NUM_CLASSES
NUM_CLASSES = 31

if __name__ == "__main__":
    # Per-class GMM state, indexed by class number minus one:
    # component count, weights, means and covariances.
    M = []
    Ws = []
    MUs = []
    COVs = []

    for i in range(NUM_CLASSES):
        id = i + 1
        print("Loading data for class {}".format(id))
        # list(): np.vstack needs a real sequence; a dict view is rejected
        # by current NumPy releases.
        train = np.vstack(list(wav16khz2mfcc("train/{}".format(id)).values()))

        print("Training model for class {}".format(id))
        # Initialisation: equal weights, randomly picked training rows as
        # means, overall per-dimension variance for every component.
        M.append(32)
        Ws.append(np.ones(M[i]) / M[i])
        MUs.append(train[np.random.randint(1, len(train), M[i])])
        COVs.append([np.var(train, axis=0)] * M[i])

        n = 15
        for iteration in range(n):
            [Ws[i], MUs[i], COVs[i], TTL] = train_gmm(train, Ws[i], MUs[i], COVs[i])
            print("Training iteration: {}/{}, total log-likelihood: {}".format(iteration + 1, n, TTL))

    errors = 0
    trials = 0
    for i in range(NUM_CLASSES):
Пример #9
0
import matplotlib.pyplot as plt
from ikrlib import wav16khz2mfcc

import numpy as np

from keras import layers
from keras import models
from keras import optimizers
from keras import utils
from keras import metrics
import datetime
import time

def _stacked_records(directory):
    """Stack every array loaded from *directory* into one array."""
    return np.vstack(tuple(wav16khz2mfcc(directory).values()))


# Load data: target / non-target, training and validation sets
train_t = _stacked_records('../data/target_train')
train_n = _stacked_records('../data/non_target_train')
val_t = _stacked_records('../data/target_dev')
val_n = _stacked_records('../data/non_target_dev')

# Report the shape of every loaded set
for _label, _array in (('train_t: ', train_t),
                       ('train_n: ', train_n),
                       ('val_t: ', val_t),
                       ('val_n: ', val_n)):
    print(_label, _array.shape)

# Preparing data: target rows first (label 1), non-target rows after (label 0)
train_x = np.concatenate((train_t, train_n))
train_y = np.concatenate((np.ones(len(train_t)), np.zeros(len(train_n))))
Пример #10
0
import matplotlib.pyplot as plt
from ikrlib import wav16khz2mfcc, logpdf_gauss, train_gauss, train_gmm, logpdf_gmm
from glob import glob
import scipy.linalg
import numpy as np
from numpy.random import randint

# Per-class containers, keyed by class number 1..31
train = {}
MUs = {}
COVs = {}
gauss_cnt = 10
Ws = {}
for i in range(1, 32):

    # list(): the records are replaced by index below, which a dict view
    # does not support.
    train[i] = list(wav16khz2mfcc('train/' + str(i)).values())

    #plt.plot(train[i][0][:,0])
    #plt.show()

    # DELETE FIRST AND LAST 200 FRAMES
    # A single slice replaces 400 row-by-row np.delete calls per record; a
    # record shorter than 400 rows becomes empty instead of raising.
    for j in range(0, len(train[i])):
        train[i][j] = train[i][j][200:-200]

    # DELETE SILENCE frames
    for j in range(0, len(train[i])):
        summ = 0
        min_eng = train[i][j][0][0]
Пример #11
0
def train_classifier(validation=False):
    """Train the speech CNN and save it to ``cnn_speech_trained_model.h5``.

    The data is loaded with ``wav16khz2mfcc``, passed item by item through
    ``remove_silence``, stacked per class and converted by
    ``create_frame_batches``.  Target data is labelled 0 and non-target
    data 1; labels are one-hot encoded with ``to_categorical``.

    Args:
        validation: when truthy, the ``TEST_TARGET`` / ``TEST_NTARGET`` data
            is prepared the same way and passed to ``model.fit`` as
            ``validation_data``.
    """
    # Raw training data (target first, then non-target)
    target_recs = list(wav16khz2mfcc(TRAIN_TARGET).values())
    ntarget_recs = list(wav16khz2mfcc(TRAIN_NTARGET).values())

    if validation:
        val_target = wav16khz2mfcc(TEST_TARGET)
        val_ntarget = wav16khz2mfcc(TEST_NTARGET)

    # Strip silence from every item, then stack each class
    target_clean = [remove_silence(rec) for rec in target_recs]
    ntarget_clean = [remove_silence(rec) for rec in ntarget_recs]
    X_train_t = np.vstack(target_clean)
    X_train_n = np.vstack(ntarget_clean)

    if validation:
        val_target_clean = [remove_silence(rec) for rec in val_target.values()]
        val_ntarget_clean = [remove_silence(rec) for rec in val_ntarget.values()]
        X_test_t = np.vstack(val_target_clean)
        X_test_n = np.vstack(val_ntarget_clean)

    # Cut the stacked data into batches for the network
    X_train_t = create_frame_batches(X_train_t)
    X_train_n = create_frame_batches(X_train_n)

    if validation:
        X_test_t = create_frame_batches(X_test_t)
        X_test_n = create_frame_batches(X_test_n)

    # Merge both classes and build the labels (0 = target, 1 = non-target)
    X_train = np.vstack((X_train_t, X_train_n))
    y_train = np.hstack((np.zeros(X_train_t.shape[0]),
                         np.ones(X_train_n.shape[0])))
    y_train_hot = to_categorical(y_train)

    if validation:
        X_test = np.vstack((X_test_t, X_test_n))
        y_test = np.hstack((np.zeros(X_test_t.shape[0]),
                            np.ones(X_test_n.shape[0])))
        y_test_hot = to_categorical(y_test)

    num_classes = 2

    # Two convolution + pooling stages followed by a small dense classifier
    model = Sequential()
    for layer in (
        Conv2D(16, (3, 3),
               input_shape=(SEGMENT_LEN, 13, 1),
               activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dropout(0.25),
        Dense(64, activation='relu'),
        Dropout(0.25),
        Dense(num_classes, activation='softmax'),
    ):
        model.add(layer)

    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])

    # Validation data is only passed along when it was prepared above
    fit_options = {'batch_size': 5, 'epochs': 5}
    if validation:
        fit_options['validation_data'] = (X_test, y_test_hot)
    model.fit(X_train, y_train_hot, **fit_options)

    model.summary()
    model.save('cnn_speech_trained_model.h5')
Пример #12
0
#!/usr/bin/python3.8
import numpy
import io
# slightly modified version of ikrlib
from ikrlib import wav16khz2mfcc, train_gmm, logpdf_gmm

# Load target / non-target training data and the evaluation data; the
# training arrays of each class are stacked into one array.
t_train_sound__dict = wav16khz2mfcc("target_train")
t_train_sound = numpy.vstack(list(t_train_sound__dict.values()))
nt_train_sound__dict = wav16khz2mfcc("non_target_train")
# Reuse the dict loaded on the previous line instead of loading the
# directory a second time.
nt_train_sound = numpy.vstack(list(nt_train_sound__dict.values()))
eval_sound__dict = wav16khz2mfcc("eval")
eval_sound = list(eval_sound__dict.values())
# Base name without extension.  Both "\" and "/" are accepted as path
# separators, so this no longer fails on "/"-separated keys or picks the
# wrong component of a nested path.
eval_sound_names = [x.replace("\\", "/").split("/")[-1].split(".")[0]
                    for x in eval_sound__dict]

# Target model: prior, component count and initial GMM parameters (randomly
# picked training rows as means, overall per-dimension variance, equal
# weights).
P_t = 0.5
M_t = 64
MUs_t = t_train_sound[numpy.random.randint(1, len(t_train_sound), M_t)]
COVs_t = [numpy.var(t_train_sound, axis=0)] * M_t
Ws_t = numpy.ones(M_t) / M_t

# Non-target model, initialised the same way
P_nt = 1 - P_t
M_nt = M_t
MUs_nt = nt_train_sound[numpy.random.randint(1, len(nt_train_sound), M_nt)]
COVs_nt = [numpy.var(nt_train_sound, axis=0)] * M_nt
Ws_nt = numpy.ones(M_nt) / M_nt

# Number of training iterations
n = 32
for i in range(n):
    [Ws_t, MUs_t, COVs_t, TTL_t] = train_gmm(
        t_train_sound, Ws_t, MUs_t, COVs_t)