Example #1
import json

from vosk import Model, KaldiRecognizer
from os import path
from pyaudio import PyAudio, paInt16
# https://github.com/alphacep/vosk-api/blob/master/doc/models.md

P = PyAudio()
stream = P.open(format=paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()
model = Model('_model')
rec = KaldiRecognizer(model, 16000)

print('Привет, я Баря!')

while True:
    data = stream.read(2000)
    if not len(data):
        break
    if rec.AcceptWaveform(data):
        text = json.loads(rec.Result())
        if text.get('text'):
            print(text.get('text'))
#     else:
#         text = json.loads(rec.PartialResult())
#         if text.get('partial'):
#             print(text['partial'])

#print(rec.FinalResult())
Example #2
 def __init__(self):
     self.model = Model(vosk_model_path)
Example #3
if not os.path.exists(model_path):
    print ("Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as {} in the current folder.".format(model_path))
    exit (1)

if not os.path.exists(spk_model_path):
    print ("Please download the speaker model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as {} in the current folder.".format(spk_model_path))
    exit (1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print ("Audio file must be WAV format mono PCM.")
    exit (1)

# Large vocabulary free form recognition
model = Model(model_path)
spk_model = SpkModel(spk_model_path)
rec = KaldiRecognizer(model, spk_model, wf.getframerate())

# We compare speakers with cosine distance. We can keep one or several fingerprints per speaker in a database
# to distinguish among users.
spk_sig = [5.64308, 4.23898, 1.119433, -0.810904, 2.115443, 2.328436, 6.135152, 1.348195, 2.60771, 1.020717, 4.324225, -0.873012, 6.123375, 4.903791, 0.064803, 4.66212, 3.502724, 2.535861, 5.452417, 7.081769, -0.823969, -5.167974, 8.568919, 4.159035, 5.314441, 3.688272, 5.730379, 4.463213, 7.227232, 3.538961, 3.316218, 1.269628, -1.902378, 3.512679, -1.947611, -1.520158, 3.80928, -2.721601, 5.359588, 2.942463, -7.474174, 3.788054, 0.303426, 4.951366, 1.72281, -1.867125, -3.574615, 3.622509, 4.803109, 2.829714, 1.528521, 6.408293, 0.820131, 5.066522, 2.836125, 2.867029, 3.725267, 0.505927, 1.462984, 5.001863, -3.838309, -2.45902, 3.992581, 4.451616, 2.865211, -1.148313, 4.996399, -3.473454, 2.876967, 3.940124, 7.553079, 0.373356, 1.396561, 2.686691, 2.094895, 0.913796, -0.286909, 3.540179, 4.904687, 0.84554, 7.585956, 1.017081, 0.168355, 6.672327, 4.092033, -4.240158, -2.017081, -0.813043, 6.468298, 4.115041, 2.231936, 2.370055, 4.972295, 5.58382, 6.022872, 2.706988, 5.248096, -1.918003, 8.259204, -0.900911, 1.961962, 2.349709, 3.290093, 3.344172, 3.307027, 4.203372, -0.315103, 5.61919, -3.229496, 3.777309, 4.328595, 1.461014, 2.622894, 0.315525, 5.447259, 5.407609, 5.339016, 1.604555, 5.359932, 0.090242, 0.535306, 4.724705, 4.692502, 0.5783, -5.436688, -4.915511, 1.959807, 2.825248]

def cosine_dist(x, y):
    nx = np.array(x)
    ny = np.array(y)
    return 1 - np.dot(nx, ny) / np.linalg.norm(nx) / np.linalg.norm(ny)

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
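The loop is cut off at this point. A minimal sketch of how it typically continues in the Vosk speaker-identification demo, assuming `json` is imported and that results carry an "spk" x-vector:

        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print("Text:", res.get("text", ""))
        if "spk" in res:
            # Compare this segment's x-vector against the stored fingerprint.
            print("Speaker distance:", cosine_dist(spk_sig, res["spk"]))

print(rec.FinalResult())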
Example #4
vosk_sample_rate = float(os.environ.get('VOSK_SAMPLE_RATE', 8000))
spk_model_path = os.environ.get('VOSK_SPK_PATH',
                                '/opt/vosk-model-es/model-spk')

if len(sys.argv) > 1:
    vosk_model_path = sys.argv[1]

# Gpu part, uncomment if vosk-api has gpu support
#
# from vosk import GpuInit, GpuInstantiate
# GpuInit()
# def thread_init():
#     GpuInstantiate()
# pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

model = Model(vosk_model_path)
spk_model = SpkModel(spk_model_path)
pool = concurrent.futures.ThreadPoolExecutor((os.cpu_count() or 1))
loop = asyncio.get_event_loop()


def process_chunk(rec, message):
    if message == '{"eof" : 1}':
        return rec.FinalResult(), True
    elif rec.AcceptWaveform(message):
        return rec.Result(), False
    else:
        return rec.PartialResult(), False


async def recognize(websocket, path):
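The handler body is missing from the excerpt. A minimal sketch of what it could look like, loosely following the pattern of Vosk's public websocket-server example (the recognizer arguments and message handling are assumptions):

async def recognize(websocket, path):
    # One recognizer per connection; decoding is offloaded to the thread pool
    # so the event loop stays responsive.
    rec = KaldiRecognizer(model, vosk_sample_rate)
    while True:
        message = await websocket.recv()
        response, stop = await loop.run_in_executor(pool, process_chunk, rec, message)
        await websocket.send(response)
        if stop:
            break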
Example #5
 def __init__(self):
     self.model_path = "./model"
     self.set_up()
     self.model = Model(self.model_path)
Example #6
spk_model_path = "model-spk"

if not os.path.exists(spk_model_path):
    print(
        "Please download the speaker model from https://alphacephei.com/vosk/models and unpack as {} in the current folder."
        .format(spk_model_path))
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
) != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

# Large vocabulary free form recognition
model = Model(lang="en-us")
spk_model = SpkModel(spk_model_path)
#rec = KaldiRecognizer(model, wf.getframerate(), spk_model)
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetSpkModel(spk_model)

# We compare speakers with cosine distance. We can keep one or several fingerprints per speaker in a database
# to distinguish among users.
spk_sig = [
    -1.110417, 0.09703002, 1.35658, 0.7798632, -0.305457, -0.339204, 0.6186931,
    -0.4521213, 0.3982236, -0.004530723, 0.7651616, 0.6500852, -0.6664245,
    0.1361499, 0.1358056, -0.2887807, -0.1280468, -0.8208137, -1.620276,
    -0.4628615, 0.7870904, -0.105754, 0.9739769, -0.3258137, -0.7322628,
    -0.6212429, -0.5531687, -0.7796484, 0.7035915, 1.056094, -0.4941756,
    -0.6521456, -0.2238328, -0.003737517, 0.2165709, 1.200186, -0.7737719,
    0.492015, 1.16058, 0.6135428, -0.7183084, 0.3153541, 0.3458071, -1.418189,
Example #7
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: analyze.py <path to audio file> <n_clusters>\n')
        sys.exit(1)
    """
    Initialize Config
    input:
    n_clusters: Integer set by a user
    text_processor: by default it is set to nltk.stem.snowball.SnowballStemmer
    sample_rate: by default set to 16 kHz due to ASR model specs
    aggressivness: required for VAD, by default set to maximum=3 as audiofiles are long
    """
    config = Config(n_clusters=int(args[1]))

    print(
        "If you want to check any specific target vocabulary, please type them\n",
        "Ex.: train, dog, work, seventeen, Brazil\n",
        "Otherwise, hit enter to skip")

    try:
        lesson_vocabulary = input().lower()
    except SyntaxError:
        pass

    lesson = LessonSegment(
        lesson_vocabulary,  # target_vocabulary
        read_audio(args[0], config.sample_rate)  # audio to get pcm_data
    )

    # update lesson dictionary to collect statistics
    lesson.update_dictionary(config.text_processor)

    # VAD
    vad = webrtcvad.Vad(config.aggressivness)
    frames = frame_generator(30, lesson.bytes, config.sample_rate)
    frames = list(frames)
    segments = vad_collector(config.sample_rate, 10, 150, vad, frames)

    # ASR
    asr = KaldiRecognizer(Model("model"), config.sample_rate)

    # store LessonSegment instances
    lesson_segments = []
    # store static tempo and pitch of each LessonSegment
    features = []
    for segment in segments:
        seg = LessonSegment('', segment)
        seg.transcribe(asr)
        features.append(seg.get_features(config.sample_rate))
        lesson_segments.append(seg)

    # Clustering
    features = MinMaxScaler().fit_transform(np.array(features))
    cl = GaussianMixture(n_components=config.n_clusters,
                         covariance_type='full')
    clusters = cl.fit_predict(features)

    # Resegmentation - create empty n*LessonSegments
    segments = [LessonSegment('', b'') for n in range(config.n_clusters)]
    for i, cluster in enumerate(clusters):
        cluster = int(cluster)
        segments[cluster].bytes += lesson_segments[i].bytes
        segments[cluster].transcript.extend(lesson_segments[i].transcript)

    [segment.get_staistics(lesson.dictionary) for segment in segments]

    for i, segment in enumerate(segments):
        path = 'resegmentation/cluster-%002d.mp3' % (i, )
        print('Writing %s' % (path, ))
        write_audio(path, segment.bytes, config.sample_rate)
        print("\n", segment.statistics, "\n")
Example #8
# View & Screen example
view = View()
stream = open('queue.json', 'wt')
p = pyaudio.PyAudio()
CHANNELS = 1
RATE = 16000
CHUNK = 8000
audio_stream = p.open(format=pyaudio.paInt16,
                      channels=CHANNELS,
                      rate=RATE,
                      input=True,
                      frames_per_buffer=CHUNK)
audio_stream.start_stream()

model = Model("models/ru")
rec = KaldiRecognizer(model, RATE)
phrase = []


def get_product_id(name):
    for key in product_codes:
        if fuzz.ratio(key, name) >= 70:
            return product_codes[key]
    return -1


while True:
    data = audio_stream.read(CHUNK)
    if len(data) == 0:
        break
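The snippet ends inside the read loop. A continuation sketch, assuming recognized phrases are matched against the product codes with the fuzzy helper above and that `json` is imported:

    if rec.AcceptWaveform(data):
        result = json.loads(rec.Result())
        text = result.get("text", "")
        if text:
            phrase.append(text)
            product_id = get_product_id(text)
            if product_id != -1:
                # Hypothetical hand-off of the match, e.g. to the queue file opened above.
                stream.write(json.dumps({"product_id": product_id, "text": text}) + "\n")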
Example #9
def main():
    flag_verbose = True
    model_path_kaldi_local = os.path.join('data', 'models', 'kaldi-ru-0.6')
    folder_record = os.path.join('/', 'mnt', 'monitor', '2020-01-28')
    #### dict users #########################
    users = {'903*******': 'user1', '903*******': 'user2'}
    ########################################
    path_cities_all = os.path.join('data', 'pop_cities_all_2019.xlsx')
    path_privet_wav = os.path.join('data', 'sounds', 'privet.wav')
    path_repeat_answer_wav = os.path.join('data', 'sounds',
                                          'repeat_answer.wav')
    path_what_city_answer_wav = os.path.join('data', 'sounds', 'what_city.wav')
    path_bye_wav = os.path.join('data', 'sounds', 'bye.wav')
    step_duration = 0.5
    sample_rate, sample_size_bytes = 8000, 2
    model_vosk = Model(
        os.path.join(
            'data', 'models',
            'alphacep-model-android-ru-0.3'))  # note: the model is 16 kHz only
    kaldi_rec_vosk = KaldiRecognizer(model_vosk, sample_rate)
    energy_threshold_in = 100  # threshold for the person answering the call
    energy_threshold_out = 1  # threshold for recordings / Google TTS prompts
    output_folder = 'output'
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    writer = pd.ExcelWriter(
        os.path.join(output_folder,
                     folder_record.split('/')[-1] + '_.xlsx'))
    wav_info = WavTools()
    # get the durations of the greeting sounds with step_duration precision
    privet_dur = wav_info.get_duration_wav(path_privet_wav, step_duration)
    repeat_answer_dur = wav_info.get_duration_wav(path_repeat_answer_wav,
                                                  step_duration)
    what_city_answer_dur = wav_info.get_duration_wav(path_what_city_answer_wav,
                                                     step_duration)
    bye_dur = wav_info.get_duration_wav(path_bye_wav, step_duration)
    df_citi_all = pd.read_excel(path_cities_all)
    lst_cities = df_citi_all['gor'].values.tolist()
    lst_sub = df_citi_all['sub'].values.tolist()
    del df_citi_all
    lst_cities = [
        re_only_text.sub(' ', re_cyr_only.sub(' ', i)).lower()
        for i in lst_cities
    ]  # keep Cyrillic characters only
    lst_cities = [
        ' '.join([morph.normal_forms(j)[0] for j in i.split()])
        for i in lst_cities
    ]  # normalize the city names to their base form
    lst_cities = np.unique(lst_cities).tolist()
    lst_sub = [
        re_only_text.sub(' ', re_cyr_only.sub(' ', i)).lower() for i in lst_sub
    ]
    lst_sub = [
        ' '.join([morph.normal_forms(j)[0] for j in i.split()])
        for i in lst_sub
    ]
    lst_sub = np.unique(lst_sub).tolist()
    if flag_verbose:
        print(100 * '#')
        print(
            f'длительность звуковых файлов в секундах с округлением до {step_duration} секунд:'
        )
        print(f'privet.wav = {privet_dur}')
        print(f'repeat_answer.wav = {repeat_answer_dur}')
        print(f'what_city.wav = {what_city_answer_dur}')
        print(f'bye.wav = {bye_dur}')
        print(100 * '#')
        print('список городов:')
        print(', '.join(lst_cities))
    for address, dirs, files in os.walk(folder_record):
        lst_wav = []
        for file_wav in files:
            if file_wav.endswith('wav'):
                lst_wav.append(file_wav)
        sheet_name = address.split('/')[-1]
        if lst_wav:
            lst_wav = sorted(lst_wav)
            lst_wav = [(lst_wav[i], lst_wav[i + 1])
                       for i in range(0, len(lst_wav), 2)]
        else:
            continue

        df_info_wav = pd.DataFrame(columns=[
            'id', 'user', 'number', 'time', 'duration', 'privet_question',
            'privet_answer', 'repeat_question', 'repeat_answer',
            'city_question', 'city_answer', 'bye', 'simultaneously_with_bot',
            'Результат', 'Update', 'Комментарий'
        ])
        df_city_answer = pd.DataFrame(columns=[
            'id', 'user', 'number', 'time', 'kaldi_docker', 'kaldi_local',
            'kaldi_vosk', 'google', 'city_isincluded_kaldi_docker',
            'city_isincluded_kaldi_local', 'city_isincluded_kaldi_vosk',
            'city_isincluded_google'
        ])
        for item in lst_wav:
            temp_dict = {k: '' for k in df_info_wav.columns.tolist()}
            temp_dict_city = {k: '' for k in df_city_answer.columns.tolist()}
            abonent = item[0].split('-')[2][1:]
            temp_dict['id'] = '-'.join(item[0].split('-')[:-1])
            temp_dict['user'] = users[abonent]
            temp_dict['number'] = abonent
            part_data = ''.join(item[0].split('-')[:2])
            date_time_str = part_data[:4] + str('-') + part_data[4:6] + str('-') + part_data[6:8] + str(
                ' ') + part_data[8:10] + \
                            str(':') + part_data[10:12] + str(':') + part_data[12:14]
            date_time_obj = datetime.strptime(date_time_str,
                                              '%Y-%m-%d %H:%M:%S')
            temp_dict['time'] = date_time_obj
            in_wav = item[0]
            out_wav = item[1]
            in_dur = wav_info.get_duration_wav(os.path.join(address, in_wav),
                                               step_duration)
            temp_dict['duration'] = in_dur
            lst_time_say_in = wav_info.get_energy_say(
                os.path.join(address, in_wav), sample_rate, sample_size_bytes,
                step_duration,
        energy_threshold_in)  # times in seconds when the caller was speaking
            lst_time_say_out = wav_info.get_energy_say(
                os.path.join(address, out_wav), sample_rate, sample_size_bytes,
                step_duration, energy_threshold_out
            )  # times in seconds when the bot was asking a question
            period_say_in = wav_info.find_period_say(
                lst_time_say_in, step_duration
            )  # convert the times into intervals when the caller was speaking
            period_say_out = wav_info.find_period_say(
                lst_time_say_out, step_duration
            )  # convert the times into intervals when the bot was speaking

            out_dict = {}
            count_none = 0
            for item in period_say_out:
                if (item[1] - item[0]) == privet_dur:
                    out_dict['privet'] = item  # the greeting played in full
                elif (item[1] - item[0]) == what_city_answer_dur or (
                        item[1] - item[0]
                ) == int(what_city_answer_dur
                         ) + 1:  # the "what city" question played in full
                    out_dict['what_city'] = item
                elif (item[1] - item[0]) == repeat_answer_dur or (
                        item[1] - item[0]
                ) == int(repeat_answer_dur
                         ) + 1:  # the "please repeat" prompt played in full
                    out_dict['repeat'] = item
                elif (item[1] -
                      item[0]) == bye_dur:  # the goodbye message played in full
                    out_dict['bye'] = item
                else:
                    out_dict['none_' +
                             str(count_none)] = item  # unrecognized interval
                    count_none += 1
            del count_none

            out_dict = dict(sorted(
                out_dict.items(),
                key=lambda x: x[1]))  # dictionary of the bot's time intervals
            count_bytes = int(
                sample_rate * sample_size_bytes *
                step_duration)  # number of bytes per step_duration step
            if 'privet' in out_dict.keys():
                temp_dict['privet_question'] = 1
                count_start_answer = out_dict['privet'][
                    1] / step_duration  # time when the greeting ended
                count_stop_answer = out_dict[list(out_dict.keys(
                ))[list(out_dict.keys()).index('privet') + 1]][
                    0] / step_duration  # start time of the question that follows the greeting
                temp_dict['privet_answer'] = rcgn_kaldi_docker(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
            else:
                temp_dict['privet_question'] = 0

            if 'repeat' in out_dict.keys():
                temp_dict['repeat_question'] = 1
                count_start_answer = out_dict['repeat'][
                    1] / step_duration  # time when the "please repeat" prompt ended
                count_stop_answer = out_dict[list(out_dict.keys())[
                    list(out_dict.keys()).index('repeat') +
                    1]][0] / step_duration  # start time of the next question
                temp_dict['repeat_answer'] = rcgn_kaldi_docker(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
            else:
                temp_dict['repeat_question'] = 0

            if 'what_city' in out_dict.keys():
                temp_dict['city_question'] = 1
                '''
                there may be several phrases
                count_start_answer - always right after the bot's speech
                count_stop_answer - taken from period_say_in
                '''
                count_start_answer = out_dict['what_city'][
                    1] / step_duration  # time when the "what city" question ended
                # count_start_answer = int([i for i in period_say_in if i[0] >= out_dict['what_city'][1]][0][0]/step_duration)
                # [i for i in period_say_in if i[0] >= out_dict['what_city'][1]] - all the periods
                count_stop_answer = out_dict[list(
                    out_dict.keys())[list(out_dict.keys()).index('what_city') +
                                     1]][0] / step_duration
                # count_stop_answer = int([i for i in period_say_in if i[0] >= out_dict['what_city'][1]][0][-1] / step_duration) + 2
                temp_dict['city_answer'] = rcgn_kaldi_docker(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
                temp_dict_city['id'] = temp_dict['id']
                temp_dict_city['user'] = temp_dict['user']
                temp_dict_city['time'] = temp_dict['time']
                temp_dict_city['kaldi_docker'] = temp_dict['city_answer']
                frame_data = wav_info.get_part_wav_bytes(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)

                temp_dict_city['kaldi_vosk'] = rcgn_kaldi_vosk(
                    frame_data=frame_data, kaldi_rec_vosk=kaldi_rec_vosk)
                temp_dict_city['kaldi_local'] = rcgn_kaldi_local(
                    model_path=model_path_kaldi_local,
                    frame_data=frame_data,
                    sample_rate=sample_rate,
                    sample_size_bytes=sample_size_bytes)

                temp_dict_city['google'] = rcgn_google(
                    frame_data=frame_data,
                    sample_rate=sample_rate,
                    sample_size_bytes=sample_size_bytes)

                ###### city is included #####
                temp_dict_city[
                    'city_isincluded_kaldi_docker'] = check_city_isincluded(
                        temp_dict_city['kaldi_docker'], lst_cities, lst_sub)
                temp_dict_city[
                    'city_isincluded_kaldi_local'] = check_city_isincluded(
                        temp_dict_city['kaldi_local'], lst_cities, lst_sub)
                temp_dict_city[
                    'city_isincluded_kaldi_vosk'] = check_city_isincluded(
                        temp_dict_city['kaldi_vosk'], lst_cities, lst_sub)
                temp_dict_city[
                    'city_isincluded_google'] = check_city_isincluded(
                        temp_dict_city['google'], lst_cities, lst_sub)

            else:
                temp_dict['city_question'] = 0

            if 'bye' in out_dict.keys():
                temp_dict['bye'] = 1
            else:
                temp_dict['bye'] = 0

            period_speak_bot = [
                item_in for item_in in period_say_in
                if any(item_out[0] <= item_in[0] and item_out[1] >= item_in[1]
                       for item_out in period_say_out)
            ]

            if period_speak_bot:
                speak_bot_text = []
                for item in period_speak_bot:
                    count_start_answer = item[0] / step_duration
                    count_stop_answer = (item[1] / step_duration) + 2
                    speak_bot_text.append(
                        rcgn_kaldi_docker(
                            path_wav=os.path.join(address, in_wav),
                            count=count_bytes,
                            count_start_answer=count_start_answer,
                            count_stop_answer=count_stop_answer))
                speak_bot_text = [
                    item for item in speak_bot_text if item != str()
                ]
                if speak_bot_text:
                    temp_dict['simultaneously_with_bot'] = ';'.join(
                        speak_bot_text)
                else:
                    temp_dict['simultaneously_with_bot'] = 1
            else:
                temp_dict['simultaneously_with_bot'] = 0

            df_info_wav = df_info_wav.append(temp_dict, ignore_index=True)
            if temp_dict['city_question'] == 1:
                df_city_answer = df_city_answer.append(temp_dict_city,
                                                       ignore_index=True)

        df_info_wav.to_excel(writer, index=False, sheet_name=str(sheet_name))
        df_city_answer.to_excel(writer,
                                index=False,
                                sheet_name='city_answer' + str(sheet_name))

    writer.save()
    writer.close()
Example #10
from vosk import Model, KaldiRecognizer
from pathlib import Path
import argparse
import wave
import json
from os import remove as del_file
from speach_text.smart_search import find_text_in_models
import speach_text.global_settings as GST

model = Model(GST.voice_model_folder)


class ErrWav(Exception):
    def __init__(self, text):
        self.txt = text


if __name__ == '__main__':
    # 'python demon_voice.py --string demon' - start the endless processing loop

    parser = argparse.ArgumentParser()
    parser.add_argument('--string', type=str, default='', help='')
    opt = parser.parse_args()

    while True:
        all_wav_file = Path(GST.voice_wav_folder).rglob('*.{}'.format(
            GST.wav_extension))
        for file_wav in all_wav_file:

            file_txt = str(file_wav.name).replace(GST.wav_extension,
                                                  GST.text_extension)
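The loop is truncated here. A sketch of the per-file transcription step such a daemon presumably performs with the shared model loaded above (the helper name is an assumption; `wave` and `json` are imported at the top of the example):

def transcribe_wav(path_wav):
    """Run a single WAV file through the shared Vosk model and return the text."""
    wf = wave.open(str(path_wav), "rb")
    rec = KaldiRecognizer(model, wf.getframerate())
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)
    return json.loads(rec.FinalResult()).get("text", "")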
Example #11
def main():
    configuration = Configuration("config/config.yaml")

    if not os.path.exists("model/" + configuration.config_list["language"]):
        print(
            "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
        )
        exit(1)

    configuration.generate_nlu_file()

    ##HOTWORD
    hotword = Hotword(configuration.config_list["hotword"])

    ##TEXT TO SPEECH
    tts = Tts()
    tts.setVoice(configuration.config_list["voice_id"])

    ##PYAUDIO
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()

    ##VOSK
    model = Model("model/" + configuration.config_list["language"])
    rec = KaldiRecognizer(model, 16000)

    ###SNIPS
    nlu = Nlu("nlu/" + configuration.config_list["language"] + "/dataset.json")

    # Load plugins
    plugin_directories = [os.path.normpath('plugins')]

    plugins_list = PluginList(plugin_directories)
    plugins_list.find_plugins()

    while True:
        data = stream.read(8000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            rec_result = json.loads(rec.Result())
            if rec_result["text"].count(hotword.getWord()) > 0:
                tts.speak(configuration.config_list["sentence_welcome"])
                hotword.setState(True)
            if hotword.getState() == True:
                if rec_result["text"] != "":
                    parsing = nlu.parse(rec_result["text"])
                    if parsing["intent"][
                            "probability"] >= configuration.config_list[
                                "min_probability"]:
                        for plugin in plugins_list._plugins:
                            plugin_object = plugins_list._plugins[
                                plugin].plugin_class
                            if plugin_object.has_intent(
                                    parsing["intent"]["intentName"]) == True:
                                response = plugin_object.get_response(
                                    parsing["intent"]["intentName"],
                                    parsing["slots"])
                                tts.speak(response)
                                hotword.setState(False)
                    elif parsing["intent"]["intentName"] == None:
                        hotword.setState(True)
                    else:
                        tts.speak(
                            "je ne suis pas sur d'avoir compris, peux-tu répéter?"
                        )
Example #12
from ..dirs import MARKUP_TXT
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import wave
import json

SetLogLevel(-1)
model = Model("models/kaldi_vosk")


def write_file(data, name):
    with open(f'{MARKUP_TXT}/{name}.txt', 'w', encoding='utf-8') as f:
        for word in data:
            f.write(str([*word.values()])[1:-1] + '\n')


def write_file_text(data, name):
    with open(f'{MARKUP_TXT}/{name}.txt', 'w', encoding='utf-8') as f:
        for word in data:
            f.write(word['word'] + ' ')


def parse_json(data):
    data = json.loads(data)
    for sample in data['result']:
        del sample['conf']
    return data['result']


def creat_text(path):
    wf = wave.open(path, "rb")
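The body of creat_text is cut off after the file is opened. A continuation sketch, assuming it produces word-level results (SetWords) and hands them to the helpers defined above:

    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    words = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = rec.Result()
            if json.loads(res).get('result'):
                words.extend(parse_json(res))
    final = rec.FinalResult()
    if json.loads(final).get('result'):
        words.extend(parse_json(final))
    # Derive the output name from the file path (assumption).
    name = path.split('/')[-1].rsplit('.', 1)[0]
    write_file(words, name)
    write_file_text(words, name)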
Example #13
    def transcribe_to_sql(self, duration, side, original_file_name, rec_date,
                          src, dst, linkedid):

        trans_start = time.time()  # datetime.datetime.now()

        if self.source_id == self.sources['master']:
            original_file_name = linkedid + ('-in.wav'
                                             if side == 0 else '-out.wav')

        transcribation_date = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        print('transcribing', self.temp_file_path + self.temp_file_name)
        # read file
        wf = wave.open(self.temp_file_path + self.temp_file_name, "rb")

        # read model
        model = Model(self.model_path)
        rec = KaldiRecognizer(model, wf.getframerate())

        # recognizing
        phrases_count = 0

        confidences = []

        while True:

            conf_score = []

            data = wf.readframes(4000)
            if len(data) == 0:
                break

            if rec.AcceptWaveform(data):
                accept = json.loads(rec.Result())
                if accept['text'] != '':

                    accept_start = str(accept['result'][0]['start'])
                    accept_end = accept['result'][-1:][0]['end']
                    accept_text = str(accept['text'])

                    for result_rec in accept['result']:
                        conf_score.append(float(result_rec['conf']))
                    conf_mid = str(sum(conf_score) / len(conf_score))
                    confidences.append(sum(conf_score) / len(conf_score))
                    # conf_score = []

                    self.save_result(duration, accept_text, accept_start,
                                     accept_end, side, transcribation_date,
                                     conf_mid, original_file_name, rec_date,
                                     src, dst, linkedid)

                    phrases_count += 1

        if len(confidences):
            self.confidence_of_file = sum(confidences) / len(confidences)
        else:
            self.confidence_of_file = 0
        trans_end = time.time()  # datetime.datetime.now()
        self.perf_log(2, trans_start, trans_end, duration, linkedid)

        if phrases_count == 0:
            self.save_result(duration, '', '0', '0', side, transcribation_date,
                             0, original_file_name, rec_date, src, dst,
                             linkedid)
Example #14
import ffmpeg
import json
import os
import shutil
import time
from vosk import Model, KaldiRecognizer
import wave

PROJECT_PATH = os.getcwd()
AUDIO_RECORDINGS = PROJECT_PATH + "/audio_examples/"
REPORT_PATH      = PROJECT_PATH + "/VOSK Speech Recognition"
# https://alphacephei.com/vosk/models
MODEL = Model("vosk-model-ru-0.10")
REPORT_LINE_WIDTH = 100

# Variables for the resulting report
number_audio = 0
total_time = 0
total_recognized_words = 0

# Creating a folder with the final transcription report
shutil.rmtree(REPORT_PATH, ignore_errors=True)
os.makedirs(REPORT_PATH)

# Formatting the transcribed text for the report
def recognition_report():
    count_word = 0
    for word in transcript.split():
        count_word += 1
        if (len(word) + recognition_report.length > REPORT_LINE_WIDTH):
            audio_report.write('\n')
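The report helper is cut off above. A sketch of the per-file transcription pass such a script typically performs; the temporary-file handling and counter updates are assumptions:

for audio_name in sorted(os.listdir(AUDIO_RECORDINGS)):
    # Convert the recording to 16 kHz mono WAV, which the Vosk model expects.
    temp_wav = os.path.join(REPORT_PATH, "temp.wav")
    ffmpeg.input(AUDIO_RECORDINGS + audio_name).output(
        temp_wav, ac=1, ar=16000).run(overwrite_output=True, quiet=True)

    wf = wave.open(temp_wav, "rb")
    rec = KaldiRecognizer(MODEL, wf.getframerate())
    start_time = time.time()
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)
    transcript = json.loads(rec.FinalResult()).get("text", "")

    number_audio += 1
    total_time += time.time() - start_time
    total_recognized_words += len(transcript.split())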
Example #15
#!/usr/bin/env python3

from vosk import Model, KaldiRecognizer
import os
import pyaudio

model = Model('model')
rec = KaldiRecognizer(model, 16000)

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

while True:
    data = stream.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
Example #16
def trigger_microphone(n_clicks):
    if n_clicks == 0:
        return ''
    print('trigger microphone %d' % n_clicks)
    import termux
    termux.Microphone.stop()
    pwd = os.environ['PWD']
    aac_file = "%s/microphone.aac" % pwd
    wave_file = "%s/microphone.wave" % pwd
    if os.path.exists(aac_file):
        os.remove(aac_file)
    termux.Microphone.record(aac_file, encoder='aac', limit=5, count=2)
    import time
    time.sleep(6)
    os.system('faad -o %s %s' % (wave_file, aac_file))
    if False:
        import speech_recognition as sr
        r = sr.Recognizer()
        with sr.WavFile(wave_file) as source:
            audio = r.record(source)
        text = r.recognize_sphinx(audio)
    else:
        from vosk import Model, KaldiRecognizer, SetLogLevel
        import json
        import wave
        import numpy as np
        model_name = 'vosk-model-small-en-us-0.15'
        if not os.path.exists(model_name):
            os.system('wget http://alphacephei.com/vosk/models/%s.zip' %
                      model_name)
            os.system('unzip %s.zip' % model_name)
        wf = wave.open(wave_file, "rb")
        model = Model(model_name)
        rec = KaldiRecognizer(model, wf.getframerate())
        nch = wf.getnchannels()
        depth = wf.getsampwidth()
        typ = {1: np.uint8, 2: np.uint16, 4: np.uint32}.get(depth)
        sdata = wf.readframes(64000)
        data = np.frombuffer(sdata, dtype=typ)
        ch_data = data[0::nch]
        sdata = ch_data.tobytes()
        if True:
            outwav = wave.open('good.wave', 'w')
            outwav.setparams(wf.getparams())
            outwav.setnchannels(1)
            outwav.writeframes(ch_data.tobytes())
            outwav.close()

        if rec.AcceptWaveform(sdata):
            result = rec.Result()
            result = json.loads(result)
            text = result['text']
        else:
            result = rec.PartialResult()
            result = json.loads(result)
            text = result['partial']
        result = rec.FinalResult()
        result = json.loads(result)
        text += result['text']
    print('finish microphone')
    print('text:%s' % text)
    return text
Example #17
def my_link():

    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    while (c in pth):
        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                pass
        result = ast.literal_eval(
            rec.FinalResult())  # convert the result string to a dictionary
        print(c)
        #print(result["text"])
        s = result["text"]
        print(s)

        #Text classification starts
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)
        #for removing punctions
        puncs = set([
            '"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%', '\\x', '!',
            '?', "'", 's'
        ])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)

        #for removing spaces
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)

        print(temp1)
        fg, word = check_list(conn, temp1)
        if fg == 1:
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()

        #close_the_connection(conn)
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        #print(pth, "-->", c)
        time.sleep(5)
    try:
        delete()
    except:
        return render_template('index.html')
    return render_template('index.html')
Example #18
    print(f'Voice model: {config.model}')
    print(f'Full voice model: {config.fullModel}')

    if bool(config.spkModel):
        print(f'Speaker identification model: {config.spkModel}')
    else:
        print('Speaker identification disabled')

    #region Load models
    model = None
    gModel = None
    spkModel = None
    if bool(config.model):
        print()
        print("=========== Загрузка основной голосовой модели ===========")
        model = Model(config.model)
        if model == None:
            fatalError(f'Ошибка при загрузке голосовой модели {config.model}')

    if bool(config.gModel):
        if config.gModel == config.model:
            gModel = model
        else:
            print()
            print("===== Загрузка модели для распознавания со словарем ======")
            gModel = Model(config.gModel)
            if gModel == None:
                fatalError(
                    f'Ошибка при загрузке голосовой модели для распознавания со словарем {config.gModel}'
                )
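The excerpt stops before the speaker model is loaded. By analogy with the two blocks above, it would presumably continue along these lines (a sketch; the SpkModel import and message wording are assumptions):

    if bool(config.spkModel):
        print()
        print("======= Loading the speaker identification model =======")
        spkModel = SpkModel(config.spkModel)
        if spkModel == None:
            fatalError(f'Failed to load speaker identification model {config.spkModel}')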
Example #19
from vosk import Model, KaldiRecognizer
import sys
import os
import wave

if not os.path.exists("model-en"):
    print(
        "Please download the model from https://github.com/alphacep/kaldi-android-demo/releases and unpack as 'model' in the current folder."
    )
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype(
) != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model-en")
# You can also specify the possible word list
rec = KaldiRecognizer(model, wf.getframerate(),
                      "zero oh one two three four five six seven eight nine")

while True:
    data = wf.readframes(1000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
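Recent Vosk examples pass the word list to KaldiRecognizer as a JSON array string rather than a plain space-separated string (compare Example #30 below). If the call above is rejected by a newer release, the equivalent form would be:

rec = KaldiRecognizer(
    model, wf.getframerate(),
    '["zero oh one two three four five six seven eight nine", "[unk]"]')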
Example #20
 def __init__(self):
     self.model = Model("model")
Example #21
	#elif 'курс биткоин' in zadanie:
		#response = requests.get("https://api.coinmarketcap.com/v1/ticker/bitcoin/")
		#response_json = response.json()
		#talk(response_json[0]['price_usd'].split('.')[0] + " долларов")
	#elif 'погода питер' in zadanie:
		#res = requests.get("http://api.openweathermap.org/data/2.5/find?q=Petersburg,RU&type=like&APPID=f98abe5235a919f50fc6536fbaa383ca")
		#data = res.json()
		#cities = ["{} ({})".format(d['name'], d['sys']['country'])
		#		for d in data['list']]
		#print( "city:", cities )
	#elif 'имя' in zadanie:
		#talk("Меня зовут Курису.")

if __name__ == "__main__":
	model = Model("models/model-ru")
	rec = KaldiRecognizer(model, 16000)
	
	p = pyaudio.PyAudio()
	stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
	stream.start_stream()

	while True:
		data = stream.read(4000)
		if len(data) == 0:
			break
		if rec.AcceptWaveform(data):
			commander_start(rec)
		#else:
			#print(rec.PartialResult())
Example #22
        }
        color_chosen = switcher.get(text, "grey")
        if color_chosen != "grey":
            self['bg']=color_chosen

def listen_microphone():
    while True:
        data = stream.read(8000, exception_on_overflow = False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            color_change.change_background_color(result["text"])

if __name__ == "__main__":
    color_change = ColorChange()

    ##PYAUDIO
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
    stream.start_stream()

    ##VOSK
    model = Model("model/fr_FR")
    rec = KaldiRecognizer(model, 16000)

    thread = threading.Thread(target=listen_microphone)
    thread.daemon = True 
    thread.start()

    color_change.mainloop()
Example #23
 def __init__(self, model_path, text_processor=None):
     SetLogLevel(-1)
     self.vosk_model = Model(model_path)
     self.text_processor = text_processor
     self.sample_rate = 16000
Example #24
 def __init__(self):
     model = Model("/home/pi/Documents/DOORS/modules/model")
     self.rec = KaldiRecognizer(model, 8000)
Example #25
    if useLM == True:
        from huggingsound import ParlanceLMDecoder
        LmModelFolder = HSSttModelFolder + "/language_model/"
        lm_path = LmModelFolder + "lm.binary"
        unigrams_path = LmModelFolder + "unigrams.txt"
        # To use this decoder you'll need to install the Parlance's ctcdecode first (https://github.com/parlance/ctcdecode)
        print('Starting to load LM file %s in ParlanceLMDecoder ...' % lm_path)
        decoder = ParlanceLMDecoder(HSmodel.token_set, lm_path=lm_path, alpha=2, beta=1, beam_width=100)    
        #decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path, unigrams_path=unigrams_path, alpha=2, beta=1, beam_width=100)
        print("Finished loading Language Model")
except Exception as e:
    print('Could not load acoustic model. Failed with message: %s' % e)
    sys.exit(-1)

print('Loading Vosk Model from folder: %s' % VoskModelFolder)
VoskModel = Model(VoskModelFolder)
rec = KaldiRecognizer(VoskModel, sampleRate)
rec.SetWords(True)

# COMMAND ----------

!ls -R /dbfs/FileStore/output/Spanish/Spanish_Conversational_Speech_Corpus/CTM/
!rm -rf /dbfs/FileStore/output/Spanish/Spanish_Conversational_Speech_Corpus/CTM/
!mkdir /dbfs/FileStore/output/Spanish/Spanish_Conversational_Speech_Corpus/CTM/

# COMMAND ----------

HSOutFolder = outFolder + "/wav2vec2"
VoskOutFolder = outFolder + "/vosk"
if not os.path.exists(HSOutFolder):
    os.mkdir(HSOutFolder)
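The excerpt ends after creating the wav2vec2 output folder; presumably the Vosk output folder is created the same way:

if not os.path.exists(VoskOutFolder):
    os.mkdir(VoskOutFolder)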
Example #26
from vosk import Model, KaldiRecognizer

import os
import urllib.request
import uuid

from http.server import HTTPServer, BaseHTTPRequestHandler
from io import BytesIO

MODEL_PATH = "model/"

AUDIO_PATH = "audio/"

SAMPLE_FREQUENCY = 16000

model = Model(MODEL_PATH)


def download_temp_file(url, dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    filename = os.path.join(dir_name, str(uuid.uuid4()))
    urllib.request.urlretrieve(url, filename=filename)
    return filename


def remove_temp_file(filename):
    if os.path.exists(filename):
        os.remove(filename)
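The request handler itself is missing from the excerpt. A hypothetical sketch of how the pieces above could be wired into the HTTP server; the handler name, request format and the wave/json imports are assumptions:

import json
import wave

class RecognizeHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # The request body is assumed to contain a URL of a 16 kHz mono WAV file.
        length = int(self.headers.get("Content-Length", 0))
        url = self.rfile.read(length).decode("utf-8").strip()
        filename = download_temp_file(url, AUDIO_PATH)
        try:
            wf = wave.open(filename, "rb")
            rec = KaldiRecognizer(model, wf.getframerate())
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                rec.AcceptWaveform(data)
            result = rec.FinalResult()  # already a JSON string
        finally:
            remove_temp_file(filename)
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(result.encode("utf-8"))

# HTTPServer(("0.0.0.0", 8080), RecognizeHandler).serve_forever()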
Example #27
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import json

SetLogLevel(0)

if not os.path.exists("model"):
    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
    exit (1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print ("Audio file must be WAV format mono PCM.")
    exit (1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetMaxAlternatives(10)

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()))
    else:
        print(json.loads(rec.PartialResult()))

print(json.loads(rec.FinalResult()))
Example #28
def my_link():
    print("entered into function for processing")

    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    #print(pth)
    print("entering into while loop")
    while (c in pth):
        #sound = AudioSegment.from_wav('C:/Users/admin/Downloads/' +  i )
        #sound = sound.set_channels(1) # To make it MONO Channel
        #sound = sound.set_frame_rate(44100) # Sample Frame rate taken here = 44,100 Hz
        #sound.export('C:/Users/admin/Downloads/' + i , format="wav")

        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                pass

        result = ast.literal_eval(
            rec.FinalResult())  # convert the result string to a dictionary

        print(c)
        s = result["text"]
        print(s)
        #integrate tc model
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)
        #for removing punctions
        puncs = set([
            '"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%', '\\x', '!',
            '?', "'", 's'
        ])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)

        #for removing spaces
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)

        fg, word = check_list(conn, temp1)
        if fg == 1:
            #flash("Abusive Detected")
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()
        #close_the_connection(conn)

        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        time.sleep(5)
    print("exiting while loop")
    delete()
    return redirect('http://127.0.0.1:5000/')
Example #29
import os
import sys
import subprocess
import codecs
import datetime

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(-1)

os.chdir(sys.argv[1])

if not os.path.exists(sys.argv[2]):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as ",
        sys.argv[2], " in the current folder.")
    exit(1)

sample_rate = 16000
model = Model(sys.argv[2])
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)

# zone rendering
if len(sys.argv) > 4 and (float(sys.argv[4]) > 0 or float(sys.argv[5]) > 0):
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[3], '-ss', sys.argv[4],
        '-t', sys.argv[5], '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ],
                               stdout=subprocess.PIPE)
else:
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[3], '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
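The second ffmpeg call is cut off above. Once process is created, the upstream example reads the raw PCM stream from its stdout in a loop roughly like this (a sketch; the json import is an assumption):

import json

results = []
while True:
    data = process.stdout.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        results.append(json.loads(rec.Result()))
results.append(json.loads(rec.FinalResult()))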
Example #30
def detectKeywords(libpath):

    audio_stream = AudiostreamSource()
    extractor = FeatureExtractor(libpath)
    detector = AudioRecognition(libpath)

    framerate = 16000
    model = Model("model")

    #Let's define a custom dictionary
    rec = KaldiRecognizer(
        model, framerate,
        '["oh one two three four five six seven eight nine zero", "[unk]"]')

    extactor_gain = 1.0

    #Add one or more keyword models
    keywordIdAlexa = detector.addModel(
        '../../models/Hotword/alexa_v3.0.35.premium', 0.85)

    bufsize = detector.getInputDataSize()

    print("Audio Recognition Version: " + detector.getVersionString())

    command_started = False

    audio_stream.start()
    try:
        while (True):
            # Wakeword loop
            if (not command_started):
                frame = audio_stream.read(bufsize * 2, bufsize * 2)
                if (not frame):
                    time.sleep(0.01)
                    continue

                features = extractor.signalToMel(frame, extactor_gain)
                prediction = detector.runDetection(features)
                if (prediction != 0):
                    now = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
                    if (prediction == keywordIdAlexa):
                        print("Alexa detected:" + now)

                    os.system(play_command + " ../resources/ding.wav")
                    command_started = True
            # vosk loop
            else:
                frame = audio_stream.read(4000, 4000)
                if (not frame):
                    time.sleep(0.01)
                    continue

                if rec.AcceptWaveform(bytes(frame)):
                    print(rec.Result())
                    command_started = False
                    print(rec.FinalResult())

    except KeyboardInterrupt:
        print("Terminating")
        audio_stream.stop()
        sys.exit(0)