Example #1
    def send_speech(self, File):
        model_path = get_model_path()
        # Resample to 16 kHz, the rate the PocketSphinx model expects.
        y, s = librosa.load(File, sr=16000)
        sf.write(File, y, s)
        config = {
            'lm': False,
            'audio_file': File,
            'hmm': constants.POCKET_SPHINX_MODEL_FILEPATH,
            'dict': os.path.join(model_path, 'cmudict-en-us.dict')
        }

        # Count keyword-spotting detections for the "yes" and "no" word lists.
        yes_result = 0
        no_result = 0
        audio = AudioFile(kws=constants.YES_WORDS_FILEPATH, **config)
        for _ in audio:
            yes_result += 1

        audio = AudioFile(kws=constants.NO_WORDS_FILEPATH, **config)
        for _ in audio:
            no_result += 1
        os.remove(File)

        # elif keeps the yes/no comparison from overwriting "unsure".
        if yes_result == 0 and no_result == 0:
            result = "unsure"
        elif yes_result > no_result:
            result = "yes"
        else:
            result = "no"

        sys.stdout.write('SET VARIABLE GoogleUtterance "%s"\n' % result)
        sys.stdout.flush()
        sys.stdout.write('EXEC "NOOP" "%s"\n' % result)
        sys.stdout.flush()
Example #2
    def extract_keywords(self,
                         file_name,
                         sample_rate=16000,
                         window_ms=1000,
                         hop_ms=500):

        kws_results = []

        # 'quartznet' is a NeMo ASR model loaded elsewhere in this class.
        files = [file_name]
        for fname, transcription in zip(
                files, quartznet.transcribe(paths2audio_files=files)):
            print(
                f"[NeMo] Audio in {fname} was recognized as: {transcription}")

        self.kws_config['audio_file'] = file_name

        audio = AudioFile(audio_file=file_name)
        print(f"Printing all audio segments in {file_name}")
        for phrase in audio:
            for s in phrase.seg():
                print(s.start_frame, s.end_frame, s.word)
                # One decoder frame is 160 samples (16 kHz at 100 frames/s).
                print(
                    transcribe(file_name, s.start_frame * 160,
                               s.end_frame * 160))
        print("Done printing segments")
        audio = AudioFile(**self.kws_config)

        for phrase in audio:
            result = phrase.segments(detailed=True)

            # TODO: confirm that when multiple keywords are detected, every detection is valid
            if len(result) == 1:
                # Detailed segments are (word, prob, start_frame, end_frame);
                # frames at frate=100 become milliseconds when multiplied by 10.
                start_time = result[0][2] * 10
                end_time = result[0][3] * 10
                # print('%4sms ~ %4sms' % (start_time, end_time))
                # 1 ms = 16 samples at a 16 kHz sample rate.
                text = transcribe(file_name, start_time * 16, end_time * 16)
                if self.keyword not in text.lower():
                    continue

                print("Pruning")
                while not good_start(text,
                                     self.keyword) and start_time < end_time:
                    start_time += 100
                    text = transcribe(file_name, start_time * 16,
                                      end_time * 16)

                while not good_end(text,
                                   self.keyword) and start_time < end_time:
                    end_time -= 100
                    text = transcribe(file_name, start_time * 16,
                                      end_time * 16)

                if text == self.keyword:
                    print("MATCH", file_name)
                    kws_results.append((start_time, end_time))

        return kws_results
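
The pruning loop above depends on good_start and good_end, which this example does not define; a minimal sketch of what they might look like (an assumption, not the original helpers):

# Hypothetical stand-ins for the helpers assumed above; the real ones are
# defined elsewhere in that project and may differ.
def good_start(text, keyword):
    # True once the transcription begins on the keyword.
    return text.lower().startswith(keyword.lower())

def good_end(text, keyword):
    # True once the transcription ends on the keyword.
    return text.lower().endswith(keyword.lower())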
Example #3
    def get_subtitles(self, input_file_path,
                      output_dir) -> SubtitleStageResult:
        phrases = []
        log = ""

        with contextlib.closing(wave.open(input_file_path, 'r')) as f:
            rate = f.getframerate()
            frames = f.getnframes()
            duration = frames / float(rate)
            log += f"rate: {rate}, frames: {frames}, duration: {duration}"

        # seg() frames are decoder frames (frate, default 100/s), not audio
        # samples, so divide by the frame rate rather than the sample rate.
        frate = 100.0
        for phrase in AudioFile(audio_file=input_file_path):
            start = sys.maxsize
            end = 0
            for seg in phrase.seg():
                start = min(start, seg.start_frame)
                end = max(end, seg.end_frame)

            log += f"{phrase}: {start}~{end}\n"

            phrases.append(
                Subtitle("", str(phrase), start / frate, end / frate))

        result = SubtitleStageResult()
        result.success = True
        result.log = log
        result.subtitles = phrases
        return result
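
Subtitle and SubtitleStageResult come from elsewhere in that project; a minimal sketch consistent with how they are used above (names and fields are assumptions):

from dataclasses import dataclass, field

@dataclass
class Subtitle:
    # Hypothetical shape matching the call Subtitle("", text, start, end).
    speaker: str
    text: str
    start_seconds: float
    end_seconds: float

@dataclass
class SubtitleStageResult:
    success: bool = False
    log: str = ""
    subtitles: list = field(default_factory=list)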
Example #4
    def extract_keywords(self,
                         file_name,
                         sample_rate=16000,
                         window_ms=1000,
                         hop_ms=500):

        kws_results = []

        self.kws_config['audio_file'] = file_name
        audio = AudioFile(**self.kws_config)

        for phrase in audio:
            result = phrase.segments(detailed=True)
            if len(result) == 0:
                continue
            if len(result) > 1:
                print(result)
                raise ValueError('Result has more than one entry')

            # Detailed segments are (word, prob, start_frame, end_frame);
            # frames at frate=100 become milliseconds when multiplied by 10.
            start_time = result[0][2] * 10
            end_time = result[0][3] * 10
            # print('%4sms ~ %4sms' % (start_time, end_time))

            kws_results.append((start_time, end_time))

        return kws_results
Example #5
    def test_kws(self):
        segments = []
        for phrase in AudioFile(lm=False,
                                keyphrase='forward',
                                kws_threshold=1e+20):
            segments = phrase.segments(detailed=True)
        self.assertEqual(segments, [('forward', -617, 63, 121)])
Example #6
def get_words_from_file(file_path):
    """
    :param file_path: audio file (must be raw 16khz 16bit)
    :return: a list of phrases made of words
    """

    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }

    audio = AudioFile(**config)

    phrases = []

    for phrase in audio:
        phrases.append(str(phrase))

    return phrases
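
A minimal call of the helper above; the file name is a placeholder for a real raw 16 kHz, 16-bit recording:

# Hypothetical usage; 'utterance.raw' is a placeholder path.
for phrase in get_words_from_file('utterance.raw'):
    print(phrase)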
Example #7
def get_phonemes_from_file(file_path):
    """
    :param file_path: audio file (must be raw 16khz 16bit)
    :return: a list of phrases made of phonemes
    """

    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'allphone': os.path.join(model_path, 'en-us/en-us-phone.lm.dmp'),
        'beam': 1e-20,
        'pbeam': 1e-20,
        'lw': 2.0
    }

    audio = AudioFile(**config)

    phrases = []

    for phrase in audio:
        phrases.append(str(phrase))

    return phrases
Example #8
    def transcribe(self, audio_file: Path):
        self.config['audio_file'] = audio_file
        transcription = ''
        # Decode only the first utterance in the file.
        for phrase in AudioFile(**self.config):
            transcription = str(phrase)
            break
        return transcription
Example #9
def decode_with_time_stamp(config):
    # with time axis
    fps = config['frate']
    for phrase in AudioFile(**config):
        print('-' * 29)
        print('| %5s |  %3s  |   %5s   |' % ('start', 'end', 'word'))
        print('-' * 29)
        for s in phrase.seg():
            print('| %4ss | %4ss | %9s |' %
                  (s.start_frame / fps, s.end_frame / fps, s.word))
        print('-' * 29)
Example #10
def keyword_spotting(config):
    # keyword spotting
    if not config.get('keyphrase'):
        raise ValueError('no keyphrase given for spotting')
    fps = config['frate']
    config['lm'] = False
    config['kws_threshold'] = 1e-20
    audio = AudioFile(**config)
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame / fps, s.end_frame / fps, s.word)
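
A sketch of a config this function could be driven with; the file name and keyphrase are illustrative values, not ones from the original:

# Hypothetical driver; 'speech.wav' and the keyphrase are placeholders.
config = {
    'audio_file': 'speech.wav',
    'frate': 100,  # decoder frames per second, used for the timestamps
    'keyphrase': 'hello computer',
}
keyword_spotting(config)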
Example #11
def keyword_list_spotting(config):
    # uses a file for inputting the keywords
    if not config.get('kws'):
        raise ValueError('no keywords file given for spotting')
    if not os.path.isfile(config['kws']):
        raise IOError('keywords file does not exist %s' % config['kws'])
    fps = config['frate']
    config['lm'] = False
    audio = AudioFile(**config)
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame / fps, s.end_frame / fps, s.word, s.prob)
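
The kws file this function expects lists one keyphrase per line with its detection threshold between slashes; a sketch with assumed names and values:

# Hypothetical keywords.list contents (one phrase and threshold per line):
#   hello computer /1e-40/
#   lights on /1e-30/
config = {'audio_file': 'speech.wav', 'frate': 100, 'kws': 'keywords.list'}
keyword_list_spotting(config)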
Example #12
def grammar_search(config):
    # search via JSGF grammar queries
    # for the Java Speech Grammar Format, see https://www.w3.org/TR/jsgf/
    if not config.get('jsgf'):
        raise ValueError('no jsgf file given for grammar search')
    if not os.path.isfile(config['jsgf']):
        raise IOError('grammar file does not exist %s' % config['jsgf'])
    fps = config['frate']
    config['lm'] = False
    # keyphrase and grammar search are mutually exclusive, so clear it
    config['keyphrase'] = None
    audio = AudioFile(**config)
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame / fps, s.end_frame / fps, s.word)
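
A minimal JSGF grammar of the kind this search consumes; the file name and rule are illustrative only:

# Hypothetical commands.jsgf contents:
#   #JSGF V1.0;
#   grammar commands;
#   public <command> = go ( forward | backward ) [ ten ] meters;
config = {'audio_file': 'speech.wav', 'frate': 100, 'jsgf': 'commands.jsgf'}
grammar_search(config)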
Example #13
def test(update):
    downloadVoiceFile(update)

    model_path = get_model_path()
    config = {
        'audio_file': os.path.join("files", LAST_VOICE_FILE),
        'verbose': False,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }

    for phrase in AudioFile(**config):
        print(phrase)
Example #14
def phone():
    config = {
        'verbose': False,
        'logfn': os.devnull,  # discard decoder logs portably on any OS
        'audio_file': 'tovmok.wav',
        'audio_device': None,
        'sampling_rate': 16000,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': 'ASRProject/model_parameters/iot.ci_cont',
        'lm': 'ASRProject/etc/iot.lm.DMP',
        'dict': 'ASRProject/etc/iot.dic',
    }

    audio = AudioFile(**config)
    for phrase in audio:
        print(phrase)
Example #15
    def detect(self, file_name):

        kws_results = []

        self.kws_config['audio_file'] = file_name
        audio = AudioFile(**self.kws_config)

        for phrase in audio:
            result = phrase.segments(detailed=True)

            # TODO: confirm that when multiple keywords are detected, every detection is valid
            if len(result) == 1:
                # frames at frate=100 -> multiply by 10 for milliseconds
                start_time = result[0][2] * 10
                end_time = result[0][3] * 10
                if self.verbose:
                    print('%4sms ~ %4sms' % (start_time, end_time))
                kws_results.append((start_time, end_time))

        return kws_results
Example #16
def transcribe(myword):
    # Map each source .mp4 name to the [start, end] times where myword occurs.
    dic = {}
    fps = 100
    print(os.getcwd() + "/audios")
    for file in os.listdir(os.getcwd() + "/audios"):
        if file.endswith("wav"):
            afile = file.split(".")[0] + ".mp4"
            dic[afile] = []
            print(file)
            for phrase in AudioFile(audio_file=os.getcwd() + "/audios/" + file,
                                    full_utt=False):
                for s in phrase.seg():
                    print('| %4ss | %4ss | %8s |' %
                          (s.start_frame / fps, s.end_frame / fps, s.word))
                    if myword in s.word:
                        occ = [s.start_frame / fps, s.end_frame / fps]
                        dic[afile].append(occ)
    return dic
Example #17
    def get_sphinx_text(self, audio_file, sample_rate, lang):

        text = ''
        try:

            config = {
                'verbose': False,
                'audio_file': audio_file,
                'buffer_size': 2048,
                'no_search': False,
                'full_utt': False,
                'hmm': os.path.join(self.model_path, 'en-us'),
                'lm': os.path.join(self.model_path, 'en-us.lm.bin'),
                'dict': os.path.join(self.model_path, 'cmudict-en-us.dict')
            }
            # Alternative Indian-English model config; built here but never
            # passed to the decoder below.
            config_in = {
                'verbose': False,
                'audio_file': audio_file,
                'buffer_size': 2048,
                'no_search': False,
                'full_utt': False,
                'hmm': os.path.join(self.model_path, 'en_in'),
                'lm': os.path.join(self.model_path, 'en-in.lm.bin'),
                'dict': os.path.join(self.model_path, 'en-in.dict')
            }

            print(config_in)
            print(config)
            audio = AudioFile(**config)

            for phrase in audio:
                text += str(phrase)
                print('CMU text:' + text)

            del audio
        except Exception as e:
            print('Error processing sphinx:', str(e))

        print('processed sphinx STT')
        return text
Example #18
def up_ps_audio(wavfile):
    """Local offline wake-word detection via the Pocketsphinx library."""
    model_path = get_model_path()

    # For swapping in your own lm and dict files, see https://blog.51cto.com/feature09/2300352
    # The required resource files are in yiwa/asr/resources/ and must be
    # placed in your /site-packages/pocketsphinx/model/ directory.
    config = {
        'verbose': False,
        'audio_file': wavfile,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'zh_cn'),
        'lm': os.path.join(model_path, '3603.lm'),
        'dict': os.path.join(model_path, '3603.dic')
    }

    # Decode the audio file; see https://pypi.org/project/pocketsphinx/
    audio = AudioFile(**config)
    for phrase in audio:
        return phrase  # return the first decoded utterance
    return None
Example #19
    def extract_keywords(self,
                         file_name,
                         sample_rate=16000,
                         window_ms=1000,
                         hop_ms=500):

        kws_results = []

        self.kws_config['audio_file'] = file_name
        audio = AudioFile(**self.kws_config)

        for phrase in audio:
            result = phrase.segments(detailed=True)

            # TODO: confirm that when multiple keywords are detected, every detection is valid
            if len(result) == 1:
                # frames at frate=100 -> multiply by 10 for milliseconds
                start_time = result[0][2] * 10
                end_time = result[0][3] * 10
                # print('%4sms ~ %4sms' % (start_time, end_time))

                kws_results.append((start_time, end_time))

        return kws_results
Example #20
def processAudio(file):
    audio = AudioFile(audio_file=file, buffer_size=1024)
    for phrase in audio:
        print(phrase)
Example #21
import os
from pocketsphinx import AudioFile

audio = AudioFile(
    audio_file=r'C:\Users\BZT\Desktop\speech_segment\speech_segment\Ses01F_impro01_M013.wav',
    keyphrase='yeah')
fps = 100
for phrase in audio:  # frate (default=100)
    # print('-' * 28)
    # print('| %5s |  %3s  |   %4s   |' % ('start', 'end', 'word'))
    # print('-' * 28)
    for s in phrase.seg():
        print('%4ss\t%4ss\t%8s' %
              (s.start_frame / fps, s.end_frame / fps, s.word))
    # print('-' * 28)

# from pocketsphinx import Pocketsphinx
#
# ps = Pocketsphinx(verbose=True, logfn='pocketsphinx.log')
# ps.decode()
#
# print(ps.hypothesis())
Example #22
# Load the flagged-word list, stripping newlines so comparisons can match.
with open("badwords.txt") as bwfile:
    badwords = [line.strip() for line in bwfile]

f = open('words.csv', 'w+')
writer = csv.writer(f)
writer.writerow(['start', 'end', 'word'])
fps = 100
config = {
    'verbose': False,
    'buffer_size': 2048,
    'audio_file': os.path.join(data_path, aF),
    'frate': fps,
    'no_search': False
}
audio = AudioFile(**config)
for phrase in audio:
    print(phrase)
    for s in phrase.seg():
        # writing timestamps of words to csv
        writer.writerow([s.start_frame / fps, s.end_frame / fps, s.word])

# Rewind so the rows just written can be read back, then append the
# rows whose word appears in the flagged list.
f.seek(0)
flagged = [row for row in csv.reader(f) if len(row) == 3 and row[2] in badwords]
for row in flagged:
    writer.writerow(row)
f.close()


# Code Wasteland
# I leave code here and come back to it if I need to
# print('| %4ss | %4ss | %8s |' % (s.start_frame / fps, s.end_frame / fps, s.word))
# writer.writerow(['{0}'.format(s.start_frame),'{0}'.format(s.end_frame),'{0}'.format(s.word)]) ### this line of code is so long it doesnt fit on one line in brackets, generally it was a bad idea, i fixed it tho and it shall lay here in the code wasteland
Example #23
def simple_decode(config):
    # simple decode
    audio = AudioFile(**config)
    for phrase in audio:
        print(phrase)
Example #24
from pocketsphinx import AudioFile

audio = AudioFile(lm=False, keyphrase='forward', kws_threshold=1e+20)
for phrase in audio:
    print(phrase.segments(detailed=True))  # => "[('forward', -617, 63, 121)]"
Example #25
def timeconv(x):
    # Reconstructed head: the original snippet is truncated above this point.
    # mil is assumed to be the millisecond part of x, zero-padded to 3 digits.
    mil = '%03d' % int(round((x - int(x)) * 1000))
    remtime = time.strftime('%H:%M:%S', time.gmtime(x))
    res = remtime + ',' + mil
    return res


f = open('sub.srt', 'w')
f.close()

fps = 100
counter = 1
z = 0        # set to 1 once the first word's start time has been recorded
h = ''       # accumulated subtitle text for the current block
start = ''
end = ''
f = open('sub.srt', 'a')
for phrase in AudioFile(audio_file='audio_filtered_final.wav',
                        frate=fps):  # frate (default=100)

    for s in phrase.segments(detailed=True):

        if z == 0 and s[0] != '<s>' and s[0] != '</s>':
            start = timeconv(s[2] / fps)
            z = 1
        if s[0] != '</s>' and s[0] != '<s>':
            end = timeconv(s[3] / fps)
        if s[0] != '</s>' and s[0] != '<s>' and s[0] != '<sil>':
            if s[0] != '[SPEECH]':
                if s[0].find('(') < 0:
                    h = h + s[0]
                else:
                    h = h + (s[0])[:s[0].find('(')]
            else:
Example #26
    def test_audiofile(self):
        hypothesis = ''
        for phrase in AudioFile():
            hypothesis = str(phrase)
        self.assertEqual(hypothesis, 'go forward ten meters')
Example #27
import os, time
from pocketsphinx import AudioFile, get_model_path

model_path = get_model_path()
exmpl_path = os.getcwd()
exmpl_path = os.path.join(exmpl_path, 'examples')

start = time.process_time()
print('start of init')
speech = AudioFile(
    verbose=False,
    audio_file=os.path.join(exmpl_path, 'coming_home_red_16000.raw'),
    buffer_size=2048,
    no_search=False,
    full_utt=False,
    hmm=os.path.join(model_path, 'zero_ru.cd_cont_4000'),
    lm=os.path.join(model_path, 'ru.lm'),
    dic=os.path.join(model_path, 'my_dict.dic')
    # dic=os.path.join(model_path, 'ru.dic')
)
stop = time.process_time()
print('time of init - ' + str(stop - start))

#digits_16000
start = time.process_time()
for _ in speech:
    pass
stop = time.process_time()
print('time of recognizing - ' + str(stop - start))
print(str(speech))
Example #28
    def listenRoutine(self):
        r = sr.Recognizer()

        for each_file in files:
            print(each_file)
            model_path = get_model_path()
            data_path = get_data_path()

            config = {
                'verbose': False,
                'audio_file': os.path.join(os.getcwd(), 'audioFiles',
                                           each_file),
                'buffer_size': 2048,
                'no_search': False,
                'full_utt': False,
                'hmm': os.path.join(model_path, 'en-us'),
                'lm': os.path.join(os.getcwd(), "TAR9991/TAR9991/9991.lm"),
                'dict': os.path.join(os.getcwd(), "TAR9991/TAR9991/9991.dic")
            }
            #print (config)

            audio = AudioFile(**config)
            for phrase in audio:
                print(phrase)

            with sr.AudioFile(
                    os.path.join(os.getcwd(), "audioFiles",
                                 each_file)) as source2:
                recording = r.record(source2)
                print(
                    r.recognize_google(recording,
                                       language="en-EN",
                                       show_all=True))

        exit()
        if 0:
            eFile = sr.AudioFile(each_file)
            with eFile as source:
                audio = r.record(source)
                print(each_file, type(audio))
                print(
                    r.recognize_google(audio, language="en-EN", show_all=True))

                #print(r.recognize_sphinx(audio, grammar="TAR9991/TAR9991/"))
        exit()

        print("\r\n\r\n*****\r\nr", r)
        list_text = [
            'a lumpy', 'hey Lumpy', 'lamp', 'Halen', 'Hayden', 'listen',
            'Listen', 'Lampe', 'lampe'
        ]
        stop_flag = True
        duration = 5
        while stop_flag:
            config = {
                'color': {
                    'hue': self.hue,
                    'saturation': self.saturation
                },
                'brightness': self.brightness,
                'on': self.on_off,
                'client': 'local'
            }

            print("    - mqtt saved:", config)

            our_device = getaudiodevices()
            print("Detected our mic:", our_device)
            with sr.Microphone(device_index=our_device,
                               sample_rate=48000) as source:
                print("Microphone source:", source, source.__dict__.keys(),
                      source.device_index)
                print(" - Call lampi (", duration, "seconds ) ...")
                print("Set minimum energy threshold to {}".format(
                    r.energy_threshold))
                r.adjust_for_ambient_noise(source)
                audio_data = r.record(source, duration=duration)
                #print(type(audio_data))
                filename = "pre_filtered_" + datetime.now().strftime(
                    "%H:%M:%S") + ".wav"
                with open(filename, "wb") as audio_file:
                    audio_file.write(audio_data.get_wav_data())
                exit()
                #print(" - Recognizing...")
                # convert speech to text
                #text = r.recognize_google(audio_data)
                try:
                    text = r.recognize_google(audio_data, language="en-EN")
                    print(" - heard: ", text)
                    text = text.split(" ")
                    for item in text:
                        #print(list_text[i])
                        if item in list_text:
                            print(" - LAMPI detected")

                            pygame.init()
                            pygame.mixer.music.load('this_is_lampi.mp3')
                            pygame.mixer.music.play()
                            time.sleep(3)
                            pygame.mixer.music.fadeout(5)

                            #stop_flag = False
                            self.commandRoutine()
                            break
                except Exception:
                    print(" - no word recognized!")
Example #29
    def fsg_search(self,
                   text_snippet,
                   audio_snippet,
                   offset_seconds,
                   operation='beginning',
                   option='safe'):
        # create grammar file for the fsg search
        fsg_file = self.generate_fsg(text_snippet, operation)

        # store the name of the file which stores the search results
        fsg_result_file = fsg_file.replace('.jsgf', '.yaml')
        self.fsg_result_files.append(fsg_result_file)

        CONFIG['jsgf'] = fsg_file
        CONFIG['audio_file'] = audio_snippet
        audio = AudioFile(**CONFIG)
        result_sequence = []
        for phrase in audio:
            for s in phrase.seg():
                start_time = s.start_frame / CONFIG['frate']
                end_time = s.end_frame / CONFIG['frate']
                if start_time != end_time and s.word != '<sil>':
                    # getting rid of NULL elements and silences
                    result_sequence.append((start_time, end_time, s.word))
        with open(fsg_result_file, 'w') as out:
            yaml.dump(result_sequence, out)
        self.remove_file(fsg_file)
        self.remove_file(audio_snippet)
        # should return the best match text snippet with beginning end
        if operation == 'beginning':
            search_snippet = copy(text_snippet)
            match_result, search_snippet_ind = self.find_match(
                result_sequence, search_snippet)
            # assert that offset_seconds is zero
            if match_result:
                result_seconds = offset_seconds + match_result[0]
                search_snippet = search_snippet[search_snippet_ind[0]:\
                                                search_snippet_ind[1]]
                self.beginning_word_index = search_snippet_ind[0]
            else:
                if option == 'safe':
                    result_seconds, search_snippet = None, []
                    self.beginning_word_index = None
                else:
                    result_seconds = offset_seconds
                    search_snippet = text_snippet
                    self.beginning_word_index = 0
        elif operation == 'ending':
            search_snippet = copy(text_snippet)[::-1]
            match_result, search_snippet_ind = self.find_match(
                result_sequence[::-1], search_snippet)
            if match_result:
                result_seconds = offset_seconds + match_result[1]
                search_snippet = search_snippet[search_snippet_ind[0]:\
                                                search_snippet_ind[1]][::-1]
                if search_snippet_ind[0] == 0:
                    self.ending_word_index = None
                else:
                    self.ending_word_index = -1 * search_snippet_ind[0]
            else:
                if option == 'safe':
                    result_seconds, search_snippet = None, []
                    self.ending_word_index = 0
                else:
                    # get result second from the audio_snippet filename
                    m = re.search(r'.+_\d+\.\d+_(\d+\.\d+)\.wav', audio_snippet)
                    if m:
                        result_seconds = float(m.groups()[0])
                    else:
                        result_seconds = end_time  # input total duration
                        self.ending_word_index = None
                    search_snippet = text_snippet
        else:
            raise ValueError('operation %s not known' % operation)
        return result_seconds, search_snippet
Example #30
from pocketsphinx import AudioFile
for phrase in AudioFile():
    print(phrase)