Example #1
import os

import speech_data
from pydub import AudioSegment as audio


def segment(data, seg_location, length):
    # Cut every wav file in `data` into `length`-second chunks and write them to `seg_location`.
    os.chdir(data)
    files = os.listdir(data)
    speakers = speech_data.get_speakers(data)
    waves = []
    num = {}
    for s in speakers:
        num[s] = 0  # per-speaker counter used to number the output chunks
    for f in files:  # load all wave files
        waves.append(audio.from_wav(f))
    os.chdir(seg_location)
    for f, w in zip(
            files,
            waves):  # split each recording into length-second intervals
        begin = 0
        end = 1
        while (end * length) < int(w.duration_seconds):
            chunk = w[begin * 1000 * length:end * 1000 * length]  # pydub slices in milliseconds
            chunk.export(
                speech_data.speaker(f) + '_' +
                str(num[speech_data.speaker(f)]) + '.wav', format='wav')
            begin = begin + 1  # advance by one interval (stepping by `length` would skip audio)
            end = end + 1
            num[speech_data.speaker(f)] = num[speech_data.speaker(f)] + 1
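
A minimal usage sketch, assuming a hypothetical source directory; the output directory and the 1-second segment length match the *_seg directories used in the later examples:

# Hypothetical paths, for illustration only.
raw_dir = '/home/cc/working/data/devclean/'
seg_dir = '/home/cc/working/data/devclean_seg/'
segment(raw_dir, seg_dir, 1)  # writes 1-second chunks named <speaker>_<n>.wav into seg_dir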
Example #2
def main():
    speakers = data.get_speakers()
    number_classes = len(speakers)
    print("speakers", speakers)

    model = make_model(number_classes)
    model.load('classifier')

    stream = audio.Stream()
    while True:
        input('press enter to record!!!')  # Python 3 built-in; raw_input is Python 2 only
        buff = stream.record(1.5)
        sample = audio.stream_to_ints(buff)
        test(model, speakers, sample)
Example #3
def main():
    speakers = data.get_speakers()
    number_classes = len(speakers)
    print("speakers", speakers)
    # train(number_classes)
    # return
    model = ml.make_model(number_classes)
    model.load('classifier')

    stream = audio.Stream()
    while True:
        input('press enter to record!!!')
        buff = stream.record(1.5)
        sample = audio.stream_to_ints(buff)
        label, conf = ml.predict(model, speakers, sample)
        print("predicted : result = %s  confidence = %.2f" % (label, conf))
Example #4
    def handle_speaker_rec_test_intent(self, message):
        speakers = data.get_speakers()
        number_classes = len(speakers)
        #print("speakers",speakers)

        #batch=data.wave_batch_generator(batch_size=1000, source=data.Source.DIGIT_WAVES, target=data.Target.speaker)
        #X,Y=next(batch)

        # Classification
        #tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

        net = tflearn.input_data(shape=[None, 8192])  #Two wave chunks
        net = tflearn.fully_connected(net, 64)
        net = tflearn.dropout(net, 0.5)
        net = tflearn.fully_connected(net,
                                      number_classes,
                                      activation='softmax')
        net = tflearn.regression(net,
                                 optimizer='adam',
                                 loss='categorical_crossentropy')

        model = tflearn.DNN(net)
        #model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)

        CWD_PATH = os.path.dirname(__file__)
        path_to_model = os.path.join(CWD_PATH, 'model', 'model.tfl')
        model.load(path_to_model)

        demo_file = "8_Vicki_260.wav"
        #demo_file = "8_Bruce_260.wav"
        demo = data.load_wav_file(data.path + demo_file)
        result = model.predict([demo])
        result = data.one_hot_to_item(result, speakers)
        if result == "Vicki":
            self.speak("I am confident I'm speaking to %s" %
                       (result))  # ~ 97% correct
        else:
            self.speak("I'm sorry I don't recognize your voice")
Example #5
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute ( on digits sample )

# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
##print("speakers",speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000,
                                  source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  #Two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')
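
The snippet breaks off at the regression layer; the rest of the original demo (repeated in Example #12 below) builds the DNN and trains it:

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)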
Example #6
import os
import sys
import librosa
import tflearn
import wave
import pickle
import tensorflow as tf
import librosa.display
import IPython.display
import numpy as np
import speech_data
from pydub import AudioSegment as audio

# now put all of the mfccs into an array
data = '/home/cc/working/data/devclean_2_seg/'
speakers = speech_data.get_speakers(data)
audio_files = os.listdir(data)
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(data + f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

net = tflearn.input_data(shape=[None, 13, 44]) 
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
Example #7
import os
import librosa
import tflearn
import speech_data
from pydub import AudioSegment as audio

speakers = speech_data.get_speakers('/home/cc/working/data/devclean_seg/')
net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=3)
model.load('/home/cc/working/models/devclean/devclean_train.tflearn')
os.chdir('/home/cc/working/data/devclean_test/')
test = []
for f1 in os.listdir(os.getcwd()):
    y, sr = librosa.load(f1)
    test.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))
result = model.predict(test)
c = 0
for f, r in zip(os.listdir(os.getcwd()), result):
    res = speech_data.one_hot_to_item(r, speakers)
    if res in f:
        c = c + 1
print('correct: %s ; total: %s' % (str(c), str(len(test))))
Example #8
import os
import sys
import librosa
import tflearn
import wave
import tensorflow as tf
import librosa.display
import IPython.display
import numpy as np
import speech_data
from pydub import AudioSegment as audio

# now put all of the mfccs into an array
data = '/home/cc/working/data/devclean_seg/'
os.chdir(data)
speakers = speech_data.get_speakers(os.getcwd())
audio_files = os.listdir(os.getcwd())
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

net = tflearn.input_data(shape=[None, 13, 44]) 
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.6)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net)
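
Example #8 stops right after constructing the model. A hedged continuation, assuming every segment yields a 13x44 MFCC matrix, and reusing the training call from the digit demo plus the checkpoint path that Example #7 loads:

model.fit(mfccs, Y, n_epoch=100, show_metric=True)
model.save('/home/cc/working/models/devclean/devclean_train.tflearn')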
Example #9
import sys
import pickle
import tflearn
import numpy as np
import speech_data

# load constants - training directory, testing directory
training_seg = '/home/cc/Data/Dev-Clean-Train-Two/'
testing = '/home/cc/Data/Dev-Clean-Test-Two'

# size of fully connected layers
n = int(sys.argv[1])
#l = sys.argv[2]
m = 18
d = 0.8

# calculate the mfcc matrices for training from the segmented data
#X = []
#Y = []
speakers = speech_data.get_speakers(training_seg)
#for f in os.listdir(training_seg):
#    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
#    y, sr = librosa.load(training_seg + f)
#    X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=int(m)))

#pickle.dump(X, open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_X.p', 'wb'))
#pickle.dump(Y, open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_Y.p', 'wb'))
X = pickle.load(
    open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_X.p', 'rb'))
Y = pickle.load(
    open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_Y.p', 'rb'))

# define the network and the model for training
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
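
The example is cut off after init_graph. A hedged sketch of the missing network and model definition, following the pattern of the other MFCC examples and assuming the pickled X entries are librosa MFCC matrices (as in the commented-out code above):

net = tflearn.input_data(shape=[None, m, X[0].shape[1]])
net = tflearn.fully_connected(net, n)
net = tflearn.dropout(net, d)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)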
Example #10
import os
import sys
import wave
import pickle
import librosa
import tflearn
import speech_data
import segment_data
import tensorflow as tf
import librosa.display
import numpy as np

# load constants - training directory, testing directory
training = '/home/cc/Data/train/'
testing = '/home/cc/Data/test/'

# calculate the mfcc matrices for training from the segmented data
X = []
Y = []
speakers = speech_data.get_speakers(training)
for f in os.listdir(training):
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(training + f)
    mfcc = np.asarray(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))
    X.append(mfcc)

# input size for fully connected layers
layer_size = int(sys.argv[1])
dropout = float(sys.argv[2])

# define the network and the model for training
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

# for just mfcc
net = tflearn.input_data(shape=[None, 20, 87])
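
The example breaks off after the input layer. A hedged continuation in the spirit of the other examples, using the layer_size and dropout arguments parsed above:

net = tflearn.fully_connected(net, layer_size)
net = tflearn.dropout(net, dropout)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)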
Example #11
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import random
random.seed(5)

import tflearn
import os
import sys
import librosa
import speech_data as data
import numpy as np
import pickle
test_data = '/home/edresson/Pti-embbending/Encoder-MFCC/Automatizado/Bases/Segments-5s/Validacao/Base1/X/'
train_data = '/home/edresson/Pti-embbending/Encoder-MFCC/Automatizado/Bases/Segments-5s/Treino/Base1/X-2/'
working = ''
# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)
#print(number_classes,speakers)
# create the MFCC arrays from the data for training
audio_files = os.listdir(working + train_data)

X = []
Y = []

try:
    with open('rna-treino_X-5s.txt', 'rb') as f:
        X = pickle.load(f)

    with open('rna-treino_Y-5s.txt', 'rb') as f:
        Y = pickle.load(f)
except (IOError, pickle.UnpicklingError):
    # The original snippet is truncated after the try block; this fallback is an
    # assumption that mirrors the MFCC extraction used in the other examples.
    for f in audio_files:
        Y.append(data.one_hot_from_item(data.speaker(f), speakers))
        y, sr = librosa.load(train_data + f)
        X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))
Example #12
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute ( on digits sample )

# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000,
                                  source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)


# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192]) #Two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
Example #13
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import os
import speech_data as data

# training and testing data sets
train_data = '/home/cc/Data/small-clean-train/'
test_data = '/home/cc/Data/small-clean-test/'

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
# Note: the original passed source=WORD_WAVs, which is never defined in this script;
# train_data (defined above) is presumably the intended source directory.
batch = data.wave_batch_generator(batch_size=1000,
                                  source=train_data,
                                  target=data.Target.speaker,
                                  speakers=speakers)
X, Y = next(batch)


# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192]) #Two wave chunks
net = tflearn.fully_connected(net, 64)
# seems like a higher dropout rate works better -- why is this??
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
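
The script trains the model but never touches test_data. A possible evaluation step, reusing the same batch generator on the test directory (the generator arguments mirror the training call and are an assumption):

test_batch = data.wave_batch_generator(batch_size=1000,
                                       source=test_data,
                                       target=data.Target.speaker,
                                       speakers=speakers)
X_test, Y_test = next(test_batch)
print('test accuracy:', model.evaluate(X_test, Y_test))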