Example #1
# Assumed imports for this excerpt; the original module header is not shown.
import sys

import torch
import torch.nn as nn
import torch.utils.data as data
from torch.autograd import Variable

import model as mod

def train(config):
    train_set, dev_set, test_set = mod.SpeechDataset.splits(config)
    if config["input_file"]:
        model = mod.SpeechModel(config)
        model.load(config["input_file"])
    else:
        model = mod.SpeechModel(config)
    if not config["no_cuda"]:
        torch.cuda.set_device(config["gpu_no"])
        model.cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    criterion = nn.CrossEntropyLoss()
    min_loss = sys.float_info.max

    train_loader = data.DataLoader(train_set, batch_size=config["batch_size"], shuffle=True, drop_last=True)
    dev_loader = data.DataLoader(dev_set, batch_size=min(len(dev_set), 100), shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=min(len(test_set), 100), shuffle=True)
    step_no = 0

    for epoch_idx in range(config["n_epochs"]):
        for batch_idx, (model_in, labels) in enumerate(train_loader):
            model.train()
            optimizer.zero_grad()
            if not config["no_cuda"]:
                model_in = model_in.cuda()
                labels = labels.cuda()
            model_in = Variable(model_in, requires_grad=False)
            scores = model(model_in)
            labels = Variable(labels, requires_grad=False)
            loss = criterion(scores, labels)
            loss.backward()
            optimizer.step()
            step_no += 1
            print_eval("train step #{}".format(step_no), scores, labels, loss)

        # Every `dev_every` epochs, evaluate on the dev set and checkpoint the best model.
        if epoch_idx % config["dev_every"] == config["dev_every"] - 1:
            model.eval()
            for model_in, labels in dev_loader:
                model_in = Variable(model_in, requires_grad=False)
                if not config["no_cuda"]:
                    model_in = model_in.cuda()
                    labels = labels.cuda()
                scores = model(model_in)
                labels = Variable(labels, requires_grad=False)
                loss = criterion(scores, labels)
                loss_numeric = loss.cpu().data.numpy()[0]
                if loss_numeric < min_loss:
                    min_loss = loss_numeric
                    model.save(config["output_file"])
                print_eval("dev", scores, labels, loss)
    evaluate(config, model, test_loader)
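
The train() function above reads a number of keys from its config dictionary. Below is a minimal sketch of such a dictionary; the keys are taken from the code above, but the concrete values are placeholders rather than defaults from the original project, and mod.SpeechDataset.splits(config) may require additional dataset-related keys that are not visible here.

config = {
    "input_file": "",            # checkpoint to resume from; empty string trains from scratch
    "output_file": "model.pt",   # where the best dev-loss checkpoint is written
    "no_cuda": False,            # set True to train on the CPU
    "gpu_no": 0,
    "lr": 0.001,
    "batch_size": 64,
    "n_epochs": 30,
    "dev_every": 1,              # run the dev evaluation every epoch
}
train(config)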
Example #2
def evaluate(config, model=None, test_loader=None):
    # Build a test loader and model only if the caller did not supply them.
    if not test_loader:
        _, _, test_set = mod.SpeechDataset.splits(config)
        test_loader = data.DataLoader(test_set, batch_size=len(test_set))
    if not config["no_cuda"]:
        torch.cuda.set_device(config["gpu_no"])
    if not model:
        model = mod.SpeechModel(config)
        model.load(config["input_file"])
    if not config["no_cuda"]:
        torch.cuda.set_device(config["gpu_no"])
        model.cuda()
    model.eval()
    criterion = nn.CrossEntropyLoss()
    results = []
    total = 0
    for model_in, labels in test_loader:
        model_in = Variable(model_in, requires_grad=False)
        if not config["no_cuda"]:
            model_in = model_in.cuda()
            labels = labels.cuda()
        scores = model(model_in)
        labels = Variable(labels, requires_grad=False)
        loss = criterion(scores, labels)
        results.append(print_eval("test", scores, labels, loss) * model_in.size(0))
        total += model_in.size(0)
    print("final test accuracy: {}".format(sum(results) / total))
Example #3
def __init__(self):
    # Audio capture settings for PyAudio.
    self.CHUNK = 1024 * 3      # frames read per buffer
    self.THRESHOLD = 0.1       # silence threshold on float32 samples
    self.FORMAT = pyaudio.paFloat32
    self.CHANNELS = 1          # mono input
    self.SAMPLE_RATE = 16000   # 16 kHz sampling rate
    # Speech model and PyAudio handle.
    self.model = md.SpeechModel()
    self.p = pyaudio.PyAudio()
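
Example #3 only shows the recorder's constructor. A hedged sketch of a method that could be added to the same class to capture audio and feed it to the model is shown below; the listen name, the thresholding logic, and the model.predict() call are assumptions, not part of the original code.

import numpy as np

def listen(self):
    # Open an input stream with the settings prepared in __init__.
    stream = self.p.open(format=self.FORMAT,
                         channels=self.CHANNELS,
                         rate=self.SAMPLE_RATE,
                         input=True,
                         frames_per_buffer=self.CHUNK)
    try:
        while True:
            # Read one chunk of float32 samples from the microphone.
            samples = np.frombuffer(stream.read(self.CHUNK), dtype=np.float32)
            # Skip near-silent chunks; only run the model above the threshold.
            if np.abs(samples).max() > self.THRESHOLD:
                self.model.predict(samples.reshape(-1, 1))
    finally:
        stream.stop_stream()
        stream.close()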
Example #4
import model
import librosa
import os

path_name = "/home/minhhiu/MyProjects/Compressed Speech Data/full_command_data/test/wav/"

cmd_list = []
m = model.SpeechModel()
correct = 0
total = 0

with open("./model/cmd_list.txt", "r", encoding="utf8") as cmd_r:
    for i in range(71):
        cmd = cmd_r.readline()
        cmd = cmd.lower().strip("\n")
        cmd_list.append(cmd)

print(cmd_list)

for d in os.listdir(path_name):
    path = os.path.join(path_name, d)
    for file in os.listdir(path):
        wav_path = os.path.join(path, file)
        txt_path = wav_path.replace("wav", "txt")

        audio, _ = librosa.load(wav_path, sr=16000, mono=True)
        audio = audio.reshape(-1, 1)

        pred_index = m.predict(audio)

        with open(txt_path, "r", encoding="utf8") as fr:
            # The original snippet is cut off here; a plausible completion that
            # compares the predicted command against the reference transcript.
            truth = fr.readline().lower().strip("\n")
            if cmd_list[pred_index] == truth:
                correct += 1
            total += 1

print("accuracy: {}".format(correct / total))
Example #5
# Assumed imports for this snippet; the original file header is not shown.
import itertools

import numpy as np

import model

# Hyperparameter grid for the search below.
n_coms = np.arange(3, 6)
n_mixs = np.arange(3, 6)
window_sizes = np.arange(0.015, 0.045, 0.01)
strides = np.arange(0.005, 0.015, 0.005)
n_mfccs = np.array([6, 9, 13])

params = itertools.product(n_coms, n_mixs, window_sizes, strides, n_mfccs)
max_setting = {}
max_acc = -np.inf

for n_com, n_mix, window_size, stride, n_mfcc in params:
    print('______')
    print('n_com = {}, n_mix = {}, window_size = {}, stride = {}, n_mfcc = {}'.format(
        n_com, n_mix, window_size, stride, n_mfcc))
    data_loader = model.DataLoader(path='tiengviet', n_mfcc=n_mfcc, window_size=window_size,
                                   overlap=stride, test_size=0.1, shuffle=True)
    data_loader.load_data()
    m = model.SpeechModel(data_loader, n_com=n_com, n_mix=n_mix)
    m.fit()
    acc = m.cal_accuracy()
    print(acc)
    if acc > max_acc:
        max_acc = acc
        max_setting['n_com'] = n_com
        max_setting['n_mix'] = n_mix
        max_setting['window_size'] = window_size
        max_setting['stride'] = stride
        max_setting['n_mfcc'] = n_mfcc
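
Once the grid search finishes, the best configuration found can be reported; a small addition that only uses the variables defined above:

print('best accuracy: {}'.format(max_acc))
print('best setting: {}'.format(max_setting))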