Example #1
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
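Example #1 calls helpers (convert_samplerate, metadata_to_string, VersionAction) and constants (N_FEATURES, N_CONTEXT, BEAM_WIDTH, LM_ALPHA, LM_BETA) that live elsewhere in the upstream client.py. A minimal sketch of what they plausibly look like for the 0.5.x-era API follows; the exact values and signatures may differ upstream.

# Sketch of the module-level names Example #1 assumes (modelled on the 0.5.x client.py).
import argparse
import shlex
import subprocess
import sys
import wave

import numpy as np
from deepspeech import Model, printVersions   # printVersions existed in the 0.5.x bindings
from shlex import quote
from timeit import default_timer as timer

# Decoder hyperparameters used throughout these snippets
N_FEATURES = 26     # number of MFCC features
N_CONTEXT = 9       # size of the context window
BEAM_WIDTH = 500    # CTC decoder beam width
LM_ALPHA = 0.75     # language model weight
LM_BETA = 1.85      # word insertion bonus


def convert_samplerate(audio_path):
    # Shell out to SoX and resample arbitrary WAV input to 16 kHz, 16-bit mono PCM.
    sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate 16000 '
               '--encoding signed-integer --endian little --compression 0.0 '
               '--no-dither - ').format(quote(audio_path))
    output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    return 16000, np.frombuffer(output, np.int16)


def metadata_to_string(metadata):
    # Join the per-character items returned by sttWithMetadata() into plain text.
    return ''.join(item.character for item in metadata.items)


class VersionAction(argparse.Action):
    # argparse action behind --version: print the library version and exit.
    def __init__(self, *args, **kwargs):
        super(VersionAction, self).__init__(nargs=0, *args, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        printVersions()
        sys.exit(0)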
Example #2
def load_model(models, alphabet, lm, trie):
    N_FEATURES = 26
    N_CONTEXT = 9
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    return [ds, model_load_end, lm_load_end]
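A hypothetical call site for Example #2's load_model(); the file paths are placeholders, not from the original.

# Hypothetical usage of load_model(); all paths are illustrative placeholders.
ds, model_load_time, lm_load_time = load_model('output_graph.pbmm', 'alphabet.txt',
                                               'lm.binary', 'trie')
logging.debug('Model ready (%0.3fs acoustic + %0.3fs LM).', model_load_time, lm_load_time)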
Example #3
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        fin = wave.open(msg['filename'], 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()
    
        decoded = ds.stt(audio, fs)
        
        queue_out.put({'prediction': decoded, 'ground_truth': msg['transcript']})
        queue_in.task_done()
class Tester(BaseTester):

    name = 'DeepSpeech'

    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [args_lm, args_trie, args_model, args_alphabet]
        for f in files:
            assert os.path.isfile(f)

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if args_lm and args_trie:
            print('Loading language model from files %s %s' % (args_lm, args_trie), file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1/16000)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio, fs)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
        return text
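Example #3's tflite_worker expects JoinableQueue messages carrying 'filename' and 'transcript' keys. A sketch of a driver that could feed it is below; the CSV column names and worker count are assumptions.

# Sketch of a driver for tflite_worker(); CSV columns and worker count are assumptions.
import csv
from multiprocessing import JoinableQueue, Process, Queue

def run_benchmark(model, alphabet, lm, trie, test_csv, num_workers=2):
    queue_in = JoinableQueue()
    queue_out = Queue()

    # One worker per GPU slot; gpu_mask pins CUDA_VISIBLE_DEVICES inside the worker.
    workers = [Process(target=tflite_worker,
                       args=(model, alphabet, lm, trie, queue_in, queue_out, i),
                       daemon=True)
               for i in range(num_workers)]
    for w in workers:
        w.start()

    count = 0
    with open(test_csv) as f:
        for row in csv.DictReader(f):
            queue_in.put({'filename': row['wav_filename'], 'transcript': row['transcript']})
            count += 1

    queue_in.join()   # wait until every file has been transcribed
    return [queue_out.get() for _ in range(count)]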
Example #6
# The audio file we want to transcribe
AUDIO_FILE = "test.wav"

# Convert the model files to absolute paths (required by DeepSpeech)
speech_model_path = str(CURRENT_FOLDER / SPEECH_MODEL)
language_model_path = str(CURRENT_FOLDER / LANGUAGE_MODEL)
language_model_trie_path = str(CURRENT_FOLDER / LANGUAGE_MODEL_TRIE)
alphabet_config_path = str(CURRENT_FOLDER / ALPHABET_CONFIG)
audio_file_path = str(CURRENT_FOLDER / AUDIO_FILE)

# Load the pre-trained speech model
deepspeech_model = DeepSpeechModel(
    speech_model_path,
    NUM_MFCC_FEATURES,
    NUM_SAMPLES_PER_WINDOW,
    alphabet_config_path,
    CTC_BEAM_WIDTH
)

# Load the pre-trained language model
deepspeech_model.enableDecoderWithLM(
    alphabet_config_path,
    language_model_path,
    language_model_trie_path,
    LANGUAGE_MODEL_WEIGHT,
    LANGUAGE_MODEL_WORD_INS_BONUS,
)

# Load audio file using the wave library
with wave.open(audio_file_path, 'rb') as input_wave_file:
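Example #6 is cut off at the with-statement (its NUM_*/CTC_* constants are also defined off-screen); the body presumably continues roughly as follows. This is a sketch, not the original code.

# Presumed continuation of the truncated with-block (sketch; wave/numpy imports assumed above).
with wave.open(audio_file_path, 'rb') as input_wave_file:
    sample_rate = input_wave_file.getframerate()   # expected to be 16000 Hz
    audio = np.frombuffer(
        input_wave_file.readframes(input_wave_file.getnframes()), np.int16)

# Run inference; the 0.5.x-era API used here takes the sample rate explicitly.
predicted_text = deepspeech_model.stt(audio, sample_rate)
print(predicted_text)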
Example #7
class Speech2Text:
    def __init__(self,
                 model_path='..\\models\\frozen_graphs\\output_graph.pbmm',
                 beam_width=500):

        self.model_path = model_path
        self.beam_width = beam_width

        print('Loading model from file {}'.format(model_path), file=sys.stderr)
        model_load_start = timer()
        self.speech_recog_engine = Model(model_path, beam_width)
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end),
              file=sys.stderr)

        self.desired_sample_rate = self.speech_recog_engine.sampleRate()

    @staticmethod
    def convert_samplerate(audio_path, desired_sample_rate):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
            quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd),
                                             stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(
                e.stderr))
        except OSError as e:
            raise OSError(
                e.errno,
                'SoX not found, use {}hz files or install it: {}'.format(
                    desired_sample_rate, e.strerror))
        return desired_sample_rate, np.frombuffer(output, np.int16)

    @staticmethod
    def metadata_to_string(metadata):
        return ''.join(item.character for item in metadata.items)

    def words_from_metadata(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i in range(0, metadata.num_items):
            item = metadata.items[i]
            # Append character to word if it's not a space
            if item.character != " ":
                word = word + item.character
            # Word boundary is either a space or the last character in the array
            if item.character == " " or i == metadata.num_items - 1:
                word_duration = item.start_time - word_start_time

                if word_duration < 0:
                    word_duration = 0

                each_word = dict()
                each_word["word"] = word
                each_word["start_time"] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)

                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0
            else:
                if len(word) == 1:
                    # Log the start time of the new word
                    word_start_time = item.start_time

        return word_list

    def load_audio_file(self, wav_audio_path):

        fin = wave.open(wav_audio_path, 'rb')
        frame_rate = fin.getframerate()

        if frame_rate != self.desired_sample_rate:
            print(
                'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
                .format(frame_rate, self.desired_sample_rate),
                file=sys.stderr)
            fs, audio = Speech2Text.convert_samplerate(
                wav_audio_path, self.desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        nframes = fin.getnframes()
        fin.close()

        return audio, nframes, frame_rate

    @staticmethod
    def load_audio_file_static(wav_audio_path, desired_sample_rate):

        fin = wave.open(wav_audio_path, 'rb')
        frame_rate = fin.getframerate()

        if frame_rate != desired_sample_rate:
            print(
                'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
                .format(frame_rate, desired_sample_rate),
                file=sys.stderr)
            fs, audio = Speech2Text.convert_samplerate(wav_audio_path,
                                                       desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        nframes = fin.getnframes()
        fin.close()

        return audio, nframes, frame_rate

    def convert_to_text(self, audio, nframes, frame_rate):

        audio_length = nframes * (1 / frame_rate)

        print('Running inference.', file=sys.stderr)
        inference_start = timer()

        text = self.speech_recog_engine.stt(audio)

        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)

        return text
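A hypothetical end-to-end use of Example #7's Speech2Text wrapper; the model and WAV paths are placeholders.

# Hypothetical usage of Speech2Text; both paths are placeholders.
engine = Speech2Text(model_path='output_graph.pbmm', beam_width=500)
audio, nframes, frame_rate = engine.load_audio_file('utterance.wav')
print(engine.convert_to_text(audio, nframes, frame_rate))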
Example #8
def main():
    "Launch point"

    parser = argparse.ArgumentParser()
    parser.add_argument('--youtube-id',
                        action="store",
                        help="Provide youtube video ID")
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet',
        required=True,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('--lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        '--trie',
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    parser.add_argument('--crop-time',
                        type=int,
                        help='You could process only n seconds.')
    args = parser.parse_args()
    file_name = download(args.youtube_id, crop_time=args.crop_time)
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    deepspeech = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet,
                       BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(
            args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        deepspeech.enableDecoderWithLM(args.alphabet, args.lm, args.trie,
                                       LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    fin = wave.open(file_name, 'rb')
    framerate_sample = fin.getframerate()
    if framerate_sample != 16000:
        print(
            'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
            .format(framerate_sample),
            file=sys.stderr)
        fin.close()
        return
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    result_sub = deepspeech.stt(audio, framerate_sample)
    result = " ".join(
        filter(lambda word: len(word) < 15, result_sub.split(" ")))
    f = open("result_text", 'w')
    f.write(result)
    f.close()
    print("Building top 20 keywords...")
    keyphrases = keywords.extract_keyphrases(result)
    print(keyphrases)
    print("Building summary sentence...")
    print(keywords.extract_summary_sentence(result))

    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example #9
# SIU KING WAI SM4701 Deepstory
# Please create a folder _test
import numpy as np
import csv
import scipy.io.wavfile as wav
from deepspeech import Model
import os
import sys
import glob
import pandas as pd
ds = Model('deepspeech-0.7.0-models.pbmm')


def transcribe(path='', array=None, silence=0.3):
    if array is not None:
        audio = array
    else:
        sr, audio = wav.read(path)
        audio = np.pad(audio, (int(sr * silence), int(sr * 0.5)), 'constant')
    return ds.stt(audio)


def main(voice):
    audio_list = glob.glob(f'{voice}_test/*.wav')
    transcription = pd.DataFrame(audio_list, columns=['path'])
    transcription['filename'] = transcription['path'].str.split(
        '.', expand=True)[0].apply(lambda x: os.path.split(x)[1])
    transcription['script'] = transcription['path'].apply(transcribe)
    transcription.drop(columns=['path'], inplace=True)
    transcription.to_csv(f'{voice}_transcription.csv',
                         encoding='utf-8',
                         index=False)  # closing argument assumed; the source listing is truncated here
Example #10
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet',
        required=True,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('--lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        '--trie',
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    parser.add_argument('--audio',
                        required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version',
                        action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended',
                        required=False,
                        action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(
            args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA,
                               LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print(
            'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
            .format(fs),
            file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example #11
def main():
    start = time.time()
    model_path = constants.fetch_contant('deepspeech', 'model_path')
    alphabet_path = constants.fetch_contant('deepspeech', 'alphabet_path')
    print('Loading model from file {}'.format(model_path), file=sys.stderr)
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    print('Loaded acoustic model after: ' + str((time.time()) - start),
          file=sys.stderr)
    lm_path = constants.fetch_contant('deepspeech', 'lm_path')
    trie_path = constants.fetch_contant('deepspeech', 'trie_path')
    print('Loading language model from files {} {}'.format(lm_path, trie_path),
          file=sys.stderr)
    ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_ALPHA,
                           LM_BETA)
    print('Loaded language model after: ' + str((time.time()) - start),
          file=sys.stderr)
    misc.reset_folders([CHUNK_PATH, TASK_PATH, CUST_PATH])
    con = None
    try:
        con = psycopg2.connect(
            "host='192.168.0.102' dbname='sales' user='******' password='******'"
        )
        cur = con.cursor()
        sql = 'delete from benchmark_deepspeech'
        cur.execute(sql)
        con.commit()
        print('Fetching tasks...')
        taskIds = fetchTaskIds()
        print('Fetched ' + str(len(taskIds)) + ' tasks!')
        for taskId in taskIds:
            print('Started ' + str(taskId))
            try:
                downloadResult = downloadTaskAudio(str(taskId))
                if downloadResult["success"]:
                    customerPath = removeAgentChannel(
                        downloadResult["abs_path"])
                    #customerPath = upsampleAudio(customerPath, 16000)
                    snippets = performVAD(customerPath)
                    for snippet in snippets:
                        text = transcribe(snippet.path, ds)
                        text = re.sub("'", "''", text)
                        if len(text) > 0:
                            url = 'http://192.168.0.100:5010/dschunks/' + re.sub(
                                CHUNK_PATH, '', snippet.path)
                            sql = "INSERT INTO public.benchmark_deepspeech (created_at, updated_at, audio_url, "
                            sql += "audio_path, is_verified, ds_transcription, real_transcription, cer, wer, task_id, from_time,"
                            sql += " to_time) VALUES(now(), now(), '" + url + "', '" + snippet.path + "', false, '" + text + "', NULL, NULL, "
                            sql += "NULL, " + str(taskId) + ", " + str(
                                snippet.from_time) + ", " + str(
                                    snippet.to_time) + ");"
                            print(sql)
                            cur.execute(sql)
                            con.commit()
            except Exception as e:
                print(e)
            print('Finished ' + str(taskId))
            #break
    except psycopg2.DatabaseError as e:
        if con:
            con.rollback()
        print(e)
        sys.exit(1)
    finally:
        if con:
            con.close()
Example #12
class InferenceThread(QObject):
    finished = Signal(str)

    def __init__(self):
        super(InferenceThread, self).__init__()
        self.in_queue = queue.Queue()
        self.should_quit = False
        self.worker = threading.Thread(target=self.run)

    def send_cmd(self, cmd):
        ''' Insert command in queue to be processed by the thread '''
        self.in_queue.put(cmd)

    def setQuit(self):
        ''' Signal to the thread that it should stop running '''
        self.should_quit = True

    def start(self):
        self.worker.start()

    def run(self):
        # Creating the model
        self.model = Model(
            os.path.join(os.path.dirname(__file__),
                         "deepspeech-0.5.1-models/output_graph.pbmm"),
            N_FEATURES, N_CONTEXT,
            os.path.join(os.path.dirname(__file__),
                         "deepspeech-0.5.1-models/alphabet.txt"), BEAM_WIDTH)
        self.model.enableDecoderWithLM(
            os.path.join(os.path.dirname(__file__),
                         "deepspeech-0.5.1-models/alphabet.txt"),
            os.path.join(os.path.dirname(__file__),
                         "deepspeech-0.5.1-models/lm.binary"),
            os.path.join(os.path.dirname(__file__),
                         "deepspeech-0.5.1-models/trie"), LM_ALPHA, LM_BETA)
        stream = None

        while True:
            # Try to get the next command from our queue, use a timeout to check
            # periodically for a quit signal so the application doesn't hang on
            # exit.
            try:
                cmd, *data = self.in_queue.get(timeout=0.3)
            except queue.Empty:
                if self.should_quit:
                    break
                # If we haven't received a quit signal just continue trying to
                # get a command from the queue indefinitely
                continue

            if cmd == "start":
                # "start" means create a new stream
                stream = self.model.setupStream()
                logging.debug("Starts to process sound")
            elif cmd == "data":
                # "data" means we received more audio data from the recorder
                if stream:
                    self.model.feedAudioContent(
                        stream, np.frombuffer(data[0].data(), np.int16))
            elif cmd == "finish":
                # "finish" means the caller wants the result of the current stream
                transcript = self.model.finishStream(stream)
                self.finished.emit(transcript)
                stream = None
                logging.debug("Finishes to process sound")
Example #13
def save_checkpoint(state, loss):
    """Save checkpoint if a new best is achieved"""
    fname = "checkpoint_" + time.strftime("%Y%m%d-%H%M%S") + "_" + str(
        loss.item()) + ".pth.tar"
    torch.save(state, get_rel_path(os.path.join(CHECKPOINTS_FOLDER,
                                                fname)))  # save checkpoint
    print("$$$ Saved a new checkpoint\n")


#####################Voice-To-Text##############

#######Deepspeech Voice-To-Text Parameters########

DS_model_file_path = 'deepspeech_data/deepspeech-0.7.0-models.pbmm'
beam_width = 500
DS_model = Model(DS_model_file_path)
DS_model.setBeamWidth(beam_width)
DS_model.enableExternalScorer('deepspeech_data/deepspeech-0.7.0-models.scorer')


def get_text(data, model=DS_model):
    #     y , s = librosa.load(fpath, sr=16000)
    y = (data * 32767).astype('int16')
    text = model.stt(y)
    return text


def get_text_score(phrase1, phrase2):
    return SequenceMatcher(a=phrase1, b=phrase2).ratio()
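Example #13's get_text() expects floating-point audio in [-1, 1] (note the *32767 scaling); a hedged usage sketch following the commented-out librosa hint:

# Hedged usage of get_text(); librosa is assumed available, as the commented line suggests.
import librosa

def transcribe_file(fpath):
    # Load as 16 kHz mono floats, matching the scaling done inside get_text().
    y, sr = librosa.load(fpath, sr=16000, mono=True)
    return get_text(y)

print(transcribe_file('test.wav'))   # placeholder path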

Example #14
import subprocess
from pydub import AudioSegment

try:
    from shlex import quote
except ImportError:
    from pipes import quote

model_file_path = '/analyze/model/deepspeech-0.9.3-models.pbmm'
lm_file_path = '/analyze/model/deepspeech-0.9.3-models.scorer'

beam_width = 500
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)

model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)

def read_wav_file(filename):
    with wave.open(filename, 'rb') as w:
        rate = w.getframerate()
        frames = w.getnframes()
        buffer = w.readframes(frames)
        print(rate)
        print(frames)

    return buffer, rate
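Example #14 stops after read_wav_file(); a sketch of feeding that buffer to the model configured above (the helper name is an assumption):

# Sketch of a transcription helper built on read_wav_file(); the name is an assumption.
import numpy as np

def transcribe_wav(filename):
    buffer, rate = read_wav_file(filename)
    # DeepSpeech 0.9.x expects 16-bit little-endian PCM at model.sampleRate() (16 kHz).
    audio = np.frombuffer(buffer, dtype=np.int16)
    return model.stt(audio)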
Example #15
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer',
                        required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio',
                        required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width',
                        type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha',
        type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta',
        type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument('--version',
                        action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended',
                        required=False,
                        action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument(
        '--json',
        required=False,
        action='store_true',
        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setModelBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer),
              file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
              file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
            .format(fs_orig, desired_sample_rate),
            file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    else:
        print(ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
Example #16
 def __init__(self):
     self.model = Model(model_path=Path(__file__).parents[1].joinpath(
         'model.pbmm').absolute().as_posix())
Example #17
 def init_deepspeech(self):
     self.ds_model = Model(ds_model_path, ds_features['beam_width'])
     self.ds_model.enableDecoderWithLM(ds_lm_path, ds_trie_path,
                                       ds_features['lm_alpha'],
                                       ds_features['lm_beta'])
     pass
Example #18
class AugmentedSpeech:
    """
    """
    def __init__(self, runVerbose=False):
        self.ds_model = None
        self.osc_client = None
        self.verbose = runVerbose
        pass

    # setting up OSC subsystem
    def init_osc(self, host, port):
        self.osc_client = SimpleUDPClient(host, port)
        pass

    # Setting up deepspeech
    def init_deepspeech(self):
        self.ds_model = Model(ds_model_path, ds_features['beam_width'])
        self.ds_model.enableDecoderWithLM(ds_lm_path, ds_trie_path,
                                          ds_features['lm_alpha'],
                                          ds_features['lm_beta'])
        pass

    # processes a frame of the ODAS tracker
    def __process_odas_frame(self, buffer):
        # get dict of json buffer
        buffer_dict = json.loads(buffer)
        # parse src
        for v in buffer_dict['src']:

            # filter out inactive sources
            if v['activity'] < 0.5:
                continue

            pay_load = []
            pay_load.append(buffer_dict['timeStamp'])
            pay_load.append(v['id'])
            pay_load.append(v['x'])
            pay_load.append(v['y'])
            pay_load.append(v['z'])
            pay_load.append(v['activity'])
            pay_load.append(v['tag'])

            self.osc_client.send_message('/source', pay_load)

    def run(self):
        print('ready ... ')
        buffer = ""
        # we pipe everything to the wrapper
        p = subprocess.Popen(odaslive_cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        for line in iter(p.stdout.readline, b''):
            s = str(line, 'utf-8')
            # a frame can be identified by a closing curly bracket
            if s.find('}\n') == 0:
                buffer += s
                self.__process_odas_frame(buffer)
                buffer = ""
            else:
                buffer += s
        p.stdout.close()
        # pass back return code
        return p.wait()
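Examples #17 and #18 rely on module-level configuration (ds_model_path, ds_lm_path, ds_trie_path, ds_features, odaslive_cmd) that is not shown; plausible placeholder definitions, with purely illustrative values, would be:

# Placeholder configuration assumed by Examples #17/#18; every path and value is illustrative.
ds_model_path = 'models/output_graph.pbmm'
ds_lm_path = 'models/lm.binary'
ds_trie_path = 'models/trie'
ds_features = {
    'beam_width': 500,   # CTC decoder beam width
    'lm_alpha': 0.75,    # language model weight
    'lm_beta': 1.85,     # word insertion bonus
}
odaslive_cmd = ['odaslive', '-c', 'odas.cfg']   # ODAS tracker invocation (illustrative)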
Example #19
class DeepspeechNode:
    # Two ways to pass in model:
    #     model_path: path the pre-trained model
    #     model: model itself
    #
    # dictionary: output from deepspeech is corrected to the closest word
    #             in the given dictionary on a word-by-word basis
    # commands: complete output from deepspeech is corrected to the
    #           closest phrase in the given commands
    def __init__(self,
                 model=None,
                 model_path=None,
                 commands=None,
                 dictionary=None):
        self.model = model
        if model_path != None:
            self.load_model(model_path)
        self.commands = commands
        self.dictionary = dictionary

    # Default values for n_feaures, n_context, beam_width are from
    # github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
    def load_model(self,
                   model_path,
                   n_features=26,
                   n_context=9,
                   beam_width=500):
        model_path += "/"
        alphabet = model_path + "alphabet.txt"
        output_graph = model_path + "output_graph.pb"
        self.model = Model(output_graph, n_features, n_context, alphabet,
                           beam_width)

    # See: en.wikipedia.org/wiki/Levenshtein_distance
    def levenshtein_distance(self, str1, str2):
        if len(str2) > len(str1):
            str2, str1 = str1, str2
        row_count = len(str1)
        col_count = len(str2)

        row_current = range(row_count + 1)
        for row_n in range(row_count):
            row_next = [row_n + 1]
            for col_n in range(col_count):
                delete_cost = row_current[col_n + 1] + 1
                insert_cost = row_next[col_n] + 1
                substitution_cost = row_current[col_n] + 1
                if str1[row_n] == str2[col_n]:
                    substitution_cost = row_current[col_n]

                row_next.append(
                    min([substitution_cost, delete_cost, insert_cost]))
            row_current = row_next
        return row_current[-1]

    def stt(self, fs, audio):
        assert self.model != None, "a model must be loaded before testing"
        transcription = self.model.stt(audio, fs)

        if self.dictionary != None:
            transcription_words = transcription.split(" ")

            new_transcription = ""
            for transcribed_word in transcription_words:
                distances = [
                    self.levenshtein_distance(transcribed_word, dict_word)
                    for dict_word in self.dictionary
                ]
                min_dist_index = min(range(len(distances)),
                                     key=lambda x: distances[x])
                word_guess = self.dictionary[min_dist_index]
                new_transcription += word_guess + " "

            transcription = new_transcription

        if self.commands != None:
            distances = [
                self.levenshtein_distance(transcription, possibility)
                for possibility in self.commands
            ]
            min_dist_index = min(range(len(distances)),
                                 key=lambda x: distances[x])
            transcription_guess = self.commands[min_dist_index]
            transcription = transcription_guess

        return transcription
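A hypothetical use of Example #19's DeepspeechNode with a small command vocabulary; the model directory, WAV path, and commands are placeholders.

# Hypothetical usage of DeepspeechNode; paths and commands are placeholders.
import scipy.io.wavfile as wav

node = DeepspeechNode(model_path='models',   # folder containing output_graph.pb and alphabet.txt
                      commands=['turn left', 'turn right', 'stop'])
fs, audio = wav.read('command.wav')          # 16 kHz mono, 16-bit WAV
print(node.stt(fs, audio))                   # output snapped to the closest known command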
Example #20
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        client_settings=ClientSettings(
            rtc_configuration={
                "iceServers": [{
                    "urls": ["stun:stun.l.google.com:19302"]
                }]
            },
            media_stream_constraints={
                "video": False,
                "audio": True
            },
        ),
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
Example #21
class ModelWrapper(MAXModelWrapper):

    MODEL_NAME = 'MAX Speech to Text Converter'
    DEFAULT_MODEL_PATH = 'assets/models/output_graph.pbmm'
    MODEL_LICENSE = "MPL-2.0"

    MODEL_META_DATA = {
        'id': '{}'.format(MODEL_NAME.lower().replace(' ', '-')),
        'name': MODEL_NAME,
        'description': 'Converts spoken words into text form.',
        'type': 'Speech-To-Text Translation',
        'license': MODEL_LICENSE,
        'source': 'https://developer.ibm.com/exchanges/models/all/max-speech-to-text-converter/'
    }

    N_FEATURES = 26  # number of MFCC features
    N_CONTEXT = 9  # Size of the context window used for producing timesteps in the input vector
    BEAM_WIDTH = 500  # Beam width used in the CTC decoder when building candidate transcriptions
    LM_ALPHA = 0.75  # The alpha hyperparameter of the CTC decoder. Language Model weight
    LM_BETA = 1.85  # The beta hyperparameter of the CTC decoder. Word insertion bonus.

    alphabet_path = 'assets/models/alphabet.txt'
    lm_path = 'assets/models/lm.binary'
    trie_path = 'assets/models/trie'

    def __init__(self, path=DEFAULT_MODEL_PATH):
        logger.info('Loading model from: {}...'.format(path))

        self.model = Model(path, self.N_FEATURES, self.N_CONTEXT, self.alphabet_path, self.BEAM_WIDTH)
        self.model.enableDecoderWithLM(self.alphabet_path, self.lm_path, self.trie_path, self.LM_ALPHA, self.LM_BETA)

        logger.info('Loaded model')

    def _convert_samplerate(self, audio_data, fs):

        resampled_audio = resample(np.frombuffer(audio_data, np.int16).astype(np.float32), fs, 16000)
        return 16000, resampled_audio.astype(np.int16)

    def _read_audio(self, audio_data):

        try:
            fin = wave.open(io.BytesIO(audio_data))
        except (wave.Error, EOFError):
            raise OSError("Error reading the audio file. Only WAV files are supported.")

        if fin.getnchannels() != 1:
            raise OSError("Only mono audio files are supported.")

        fin_len = fin.getnframes() / fin.getframerate()  # num frames / frame rate = length in seconds

        if fin_len > 10:
            raise OSError("This model is designed to work with short (about 5 second) audio files only.")

        return fin

    def _pre_process(self, audio_data):
        fin = self._read_audio(audio_data)
        fs = fin.getframerate()
        if fs != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech '
                  'recognition.'.format(fs), file=sys.stderr)
            fs, audio = self._convert_samplerate(audio_data, fs)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        fin.close()
        return audio

    def _post_process(self, preds):
        return preds

    def _predict(self, x):
        preds = self.model.stt(x, 16000)
        return preds
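A hedged use of Example #21's ModelWrapper on raw WAV bytes; the file path is a placeholder, and _pre_process/_predict are the hooks the MAX framework would normally call.

# Hedged usage of ModelWrapper; 'sample.wav' is a placeholder path.
wrapper = ModelWrapper()
with open('sample.wav', 'rb') as f:
    audio = wrapper._pre_process(f.read())
print(wrapper._post_process(wrapper._predict(audio)))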
Example #22
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self,
                              frames: List[av.AudioFrame]) -> av.AudioFrame:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        client_settings=ClientSettings(
            rtc_configuration={
                "iceServers": [{
                    "urls": ["stun:stun.l.google.com:19302"]
                }]
            },
            media_stream_constraints={
                "video": True,
                "audio": True
            },
        ),
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
Example #23
try:
  import unzip_requirements
except ImportError:
  pass

import shlex
import subprocess
import base64
import io
import sys
import wave
import json
import numpy as np

from deepspeech import Model

ds = Model('./model/deepspeech-0.7.1-models.pbmm')
desired_sample_rate = ds.sampleRate()

try:
    from shlex import quote
except ImportError:
    from pipes import quote


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
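Example #23 breaks off inside the except clause; judging from the identical helper in Example #29 below, it presumably ends like this:

        # Presumed continuation, mirroring the identical helper in Example #29.
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)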
Example #24
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        '--alphabet',
        required=True,
        help=
        'Path to the configuration file specifying the alphabet used by the network'
    )
    parser.add_argument('--lm',
                        nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument(
        '--trie',
        nargs='?',
        help=
        'Path to the language model trie file created with native_client/generate_trie'
    )
    parser.add_argument('--audio',
                        required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width',
                        type=int,
                        default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha',
                        type=float,
                        default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta',
                        type=float,
                        default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version',
                        action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended',
                        required=False,
                        action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.alphabet, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(
            args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        print(
            'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
            .format(fs, desired_sample_rate),
            file=sys.stderr)
        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1 / fs)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length),
          file=sys.stderr)
    return 16000, np.frombuffer(output, np.int16)
Example #25
LM_WEIGHT = 1.5
VALID_WORD_COUNT_WEIGHT = 2.25
N_FEATURES = 26
N_CONTEXT = 9
BEAM_WIDTH = 512

model = '/home/nihadern/voice_recog/models/output_graph.rounded.pbmm'
micro = '/home/nihadern/voice_recog/male.wav'
alpha = '/home/nihadern/voice_recog/models/alphabet.txt'
trie = '/home/nihadern/voice_recog/models/trie'
lm = '/home/nihadern/voice_recog/models/lm.binary'

ds = Model(model, N_FEATURES, N_CONTEXT, alpha,
           BEAM_WIDTH)  #model link, cepstrum, context
ds.enableDecoderWithLM(alpha, lm, trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
print('\nModel ok')
print('\nreading voice')

fin = wave.open(micro, 'rb')
fs = fin.getframerate()
if fs != 16000:
    print(
        'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
        .format(fs),
        file=sys.stderr)
    fs, audio = convert_samplerate(micro)
else:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
Example #26
dirname2 = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
import garbagecollector as gc

sample_rate = 16000
beam_width = 500
lm_alpha = 0.75
lm_beta = 1.85
n_features = 26
n_context = 9

model_name = dirname2+"/models/output_graph.pbmm"
alphabet = dirname2+"/models/alphabet.txt"
language_model = dirname2+"/models/lm.binary"
trie = dirname2+"/models/trie"

ds = Model(model_name,beam_width)
try:
	ds.enableDecoderWithLM(language_model, trie, lm_alpha, lm_beta)
except:
	print("No language model and trie specified")

def downsampleWav(src, dst, inrate=44100, outrate=sample_rate, inchannels=2, outchannels=1):
    if not os.path.exists(src):
        return ('Source not found!',False)

    if not os.path.exists(os.path.dirname(dst)):
        os.makedirs(os.path.dirname(dst))

    try:
        s_read = wave.open(src, 'r')
        s_write = wave.open(dst, 'w')
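Example #26's downsampleWav() is truncated right after the two wave handles are opened; the body presumably continues along these lines, using the standard-library audioop module (a sketch under that assumption, not the original code):

        # Presumed continuation (sketch): mix to mono, resample, write out, report status.
        # Assumes `import audioop` alongside the other off-screen imports.
        n_frames = s_read.getnframes()
        data = s_read.readframes(n_frames)

        if inchannels == 2 and outchannels == 1:
            data = audioop.tomono(data, 2, 0.5, 0.5)              # 16-bit stereo -> mono
        data, _ = audioop.ratecv(data, 2, outchannels, inrate, outrate, None)

        s_write.setparams((outchannels, 2, outrate, 0, 'NONE', 'Uncompressed'))
        s_write.writeframes(data)
        s_read.close()
        s_write.close()
        return ('Converted!', True)
    except Exception as e:
        return (str(e), False)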
Example #27
from deepspeech import Model
for i in range(5):
    ds = Model('/models/mozilla/deepspeech-0.7.3-models.pbmm')
    ds.enableExternalScorer('/models/mozilla/deepspeech-0.7.3-models.scorer')
    ds.setScorerAlphaBeta(0.75, 1.85)
    ds.__del__()
from flask import Flask
from flask import jsonify
from flask import request
from flask_cors import CORS, cross_origin

BEAM_WIDTH = 1024
LM_WEIGHT = 0.75
VALID_WORD_COUNT_WEIGHT = 1.85
N_FEATURES = 26
N_CONTEXT = 9
MODEL_FILE = 'models/output_graph.pbmm'
ALPHABET_FILE = 'models/alphabet.txt'
LANGUAGE_MODEL =  'models/lm.binary'
TRIE_FILE =  'models/trie'

ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
ds.enableDecoderWithLM(ALPHABET_FILE, LANGUAGE_MODEL, TRIE_FILE, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)

app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

@app.route('/', methods=['POST'])
@cross_origin()
def post():
    fileName = 'file_'+str(uuid.uuid4())+'.wav'
    with open(fileName, "wb") as vid:
        vid.write(request.data)

    fs, audio = scipy.io.wavfile.read(fileName)
    processed_data = ds.stt(audio, fs)
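Example #27's request handler is cut off after the stt() call (its os/uuid/scipy imports are also off-screen); a plausible ending for the route, with the response shape and cleanup as assumptions, is:

    # Presumed ending of the handler; the JSON shape and cleanup are assumptions.
    os.remove(fileName)                               # drop the temporary upload
    return jsonify({'transcription': processed_data})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)                # port choice is illustrative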
Example #29
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)




print('Loading model from file {}'.format(model))
model_load_start = timer()
# sphinx-doc: python_ref_model_start
ds = Model(model)
# sphinx-doc: python_ref_model_stop
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end))

if beam_width:
    ds.setBeamWidth(beam_width)

desired_sample_rate = ds.sampleRate()

if scorer:
    print('Loading scorer from files {}'.format(scorer))
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end))
Example #30
# Number of MFCC features to use
N_FEATURES = 26

# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9

# How aggressive to be when splitting audio files into chunks
aggressiveness = 1

UPLOAD_FOLDER = '/tmp'
ALLOWED_EXTENSIONS = set(['wav', 'mp3', 'flac'])

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
ds = Model('models/output_graph.pbmm', N_FEATURES, N_CONTEXT,
           'models/alphabet.txt', BEAM_WIDTH)
ds.enableDecoderWithLM('models/alphabet.txt', 'models/lm.binary',
                       'models/trie', LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
api_keys = []
api_keyfile = 'api_keys.txt'
transcription_in_progress = False
print(transcription_in_progress)


def load_keys(keylist):
    with open(keylist) as f:
        for line in f:
            credential = line.split(', ')
            api_keys.append(credential[0])

Example #31
def main():
    global line_count
    print("AutoSub v0.1\n")

    parser = argparse.ArgumentParser(description="AutoSub v0.1")
    parser.add_argument('--model', required=True, help='DeepSpeech model file')
    parser.add_argument('--scorer', help='DeepSpeech scorer file')
    parser.add_argument('--file', required=True, help='Input video file')
    args = parser.parse_args()

    ds_model = args.model
    if not ds_model.endswith(".pbmm"):
        print("Invalid model file. Exiting\n")
        exit(1)

    # Load DeepSpeech model
    ds = Model(ds_model)

    if args.scorer:
        ds_scorer = args.scorer
        if not ds_scorer.endswith(".scorer"):
            print(
                "Invalid scorer file. Running inference using only model file\n"
            )
        else:
            ds.enableExternalScorer(ds_scorer)

    input_file = args.file
    print("\nInput file:", input_file)

    base_directory = os.getcwd()
    output_directory = os.path.join(base_directory, "output")
    audio_directory = os.path.join(base_directory, "audio")
    video_file_name = input_file.split("/")[-1].split(".")[0]
    audio_file_name = os.path.join(audio_directory, video_file_name + ".wav")
    srt_file_name = os.path.join(output_directory, video_file_name + ".srt")

    # Extract audio from input video file
    extract_audio(input_file, audio_file_name)

    print("Splitting on silent parts in audio file")
    silenceRemoval(audio_file_name)

    # Output SRT file
    file_handle = open(srt_file_name, "a+")

    print("\nRunning inference:")

    for file in tqdm(sort_alphanumeric(os.listdir(audio_directory))):
        audio_segment_path = os.path.join(audio_directory, file)

        # Don't run inference on the original audio file
        if audio_segment_path.split("/")[-1] != audio_file_name.split("/")[-1]:
            ds_process_audio(ds, audio_segment_path, file_handle)

    print("\nSRT file saved to", srt_file_name)
    file_handle.close()

    # Clean audio/ directory
    shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)
class DetectAndDeter:
    CLASSIFICATION_COUNT = 5  # minimum number of classified utterances before deciding
    TELEMARKETER_THRESH = 0.3  # fraction of "persuasion" predictions above which the caller is flagged
    VALID_CALLER_THRESH = 0.1  # fraction below which the caller is treated as legitimate
    IN_AUDIO_RATE = 8000  # sample rate of the incoming (telephony, mu-law) audio
    DS_AUDIO_RATE = 16000  # sample rate DeepSpeech expects
    MOZILLA_TTS_AUDIO_RATE = 22050  # sample rate of Mozilla TTS output
    QUIET_THRESH = 150  # amplitude below which a sample counts as silence
    QUIET_LENGTH = 3000  # number of trailing samples that must be quiet before decoding

    def __init__(self, name):
        self.name = name  # user's name  e.g. "Bob Ross"
        self.valid_caller_event = Event()
        self.caller_audio_chunk = np.array([], dtype='int16')

        self.audio_in_queue = Queue()
        self.stt_to_classification_queue = Queue()
        self.stt_to_chatbot_queue = Queue()
        self.chatbot_to_tts_queue = Queue()
        self.audio_out_queue = Queue()

        self.manager = Manager()
        self.transcript = self.manager.list()
        self.is_telemarketer = self.manager.Value("is_telemarketer", None)

        self.deep_speech = None
        self.mozilla_tts = None

        self.final_transcript = None
        self.final_predictions = None
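
        # Each pipeline stage (speech-to-text, classification, chatbot, text-to-speech)
        # runs in its own process and communicates through the queues created above.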

        self.speech_to_text_thread = Process(target=self.speech_to_text)
        self.classify_text_thread = Process(target=self.classify_text)
        self.generate_response_thread = Process(target=self.generate_responses)
        self.text_to_speech_thread = Process(target=self.text_to_speech)

        self.log = {
            "start": None,
            "end": None,
            "version": CONFIG['version'],
            "transcript": [],
            "is_telemarketer": None,
            "caller": None
        }

    @property
    def queues(self):
        return self.audio_in_queue, self.audio_out_queue

    def start(self):
        self.speech_to_text_thread.start()
        self.classify_text_thread.start()
        self.generate_response_thread.start()
        self.text_to_speech_thread.start()

        self.log["start"] = dt.datetime.now().isoformat()

    def close(self):
        self.log["transcript"] = [value for value in self.transcript]
        self.log["is_telemarketer"] = self.is_telemarketer.value
        self.log["end"] = dt.datetime.now().isoformat()

        self.speech_to_text_thread.terminate()
        self.speech_to_text_thread.join()
        self.speech_to_text_thread.close()

        self.classify_text_thread.terminate()
        self.classify_text_thread.join()
        self.classify_text_thread.close()

        self.generate_response_thread.terminate()
        self.generate_response_thread.join()
        self.generate_response_thread.close()

        self.text_to_speech_thread.terminate()
        self.text_to_speech_thread.join()
        self.text_to_speech_thread.close()

    def fill_log_info(self, caller_number):
        self.log['caller'] = caller_number
        return self.log

    def classify_text(self):
        predictions = []
        while self.is_telemarketer.value is None:
            idx = self.stt_to_classification_queue.get()
            text = self.transcript[idx]['text']
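
            # `model` is a text classifier created elsewhere in the project; judging
            # from its use here, preds[0] is the predicted label and preds[2] holds
            # the per-class confidences.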

            preds = model.predict(text)
            transcript_line = self.transcript[idx]
            transcript_line['analysis'] = {
                "prediction": str(preds[0]).lower(),
                "confidence": float(max(preds[2]))
            }
            self.transcript[idx] = transcript_line
            predictions.append(str(preds[0]).lower())

            maybe_telemarketer = predictions.count("persuasion") / len(
                predictions)

            if len(predictions) > self.CLASSIFICATION_COUNT:
                print("CLASS")
                print(maybe_telemarketer, self.TELEMARKETER_THRESH,
                      self.VALID_CALLER_THRESH)
                if maybe_telemarketer > self.TELEMARKETER_THRESH:
                    self.is_telemarketer.value = True
                    break
                elif maybe_telemarketer < self.VALID_CALLER_THRESH:
                    self.is_telemarketer.value = False
                    # self.is_telemarketer.set()
                    break

        if not self.is_telemarketer.value:
            self.valid_caller_event.set()

    def generate_responses(self):
        while True:
            text = self.stt_to_chatbot_queue.get()
            print("Generate Response:", text)
            response = str(chatbot.get_response(text))

            self.chatbot_to_tts_queue.put(response)

    def text_to_speech(self):
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound_arr = np.array(self.mozilla_tts.tts(response))

            sound_arr *= 2**15
            sound_arr = sound_arr.astype('int16')

            sound = bytes(sound_arr)
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)

            ulaw_sound = audioop.lin2ulaw(sound, 2)

            chunk_len = 540
            chunks = len(ulaw_sound) // chunk_len
            extra = len(ulaw_sound) - (chunks * chunk_len)

            for c in range(chunks):
                chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            if extra != 0:
                chunk = ulaw_sound[-extra:]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })

    def speech_to_text(self):
        stt_config = CONFIG['stt_config']
        models_folder = Path(stt_config['folder'])
        model_path = str(models_folder / stt_config['model'])
        scorer_path = str(models_folder / stt_config['scorer'])

        self.deep_speech = Model(model_path)
        self.deep_speech.enableExternalScorer(scorer_path)

        stream = self.deep_speech.createStream()
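
        # Drain all queued mu-law audio, convert it to 16 kHz linear PCM for DeepSpeech,
        # and decode an utterance whenever the caller goes quiet.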

        while True:
            speech = self.audio_in_queue.get()

            while not self.audio_in_queue.empty():
                speech += self.audio_in_queue.get()

            lin_speech = audioop.ulaw2lin(speech, 2)
            ds_speech, _ = audioop.ratecv(lin_speech, 2, 1, self.IN_AUDIO_RATE,
                                          self.DS_AUDIO_RATE, None)

            lin_speech_arr = np.frombuffer(lin_speech, np.int16)
            ds_speech_arr = np.frombuffer(ds_speech, np.int16)

            stream.feedAudioContent(ds_speech_arr)

            self.caller_audio_chunk = np.concatenate(
                (self.caller_audio_chunk, lin_speech_arr))
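
            # If the trailing QUIET_LENGTH samples are all below QUIET_THRESH (and something
            # louder was heard earlier), treat it as a pause and decode the utterance so far.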

            chunk_idx = max(0,
                            len(self.caller_audio_chunk) - self.QUIET_LENGTH)
            quiet_chunk = self.caller_audio_chunk[chunk_idx:]
            if (quiet_chunk < self.QUIET_THRESH).all() and (
                    self.caller_audio_chunk > self.QUIET_THRESH).any():
                text = stream.intermediateDecode()

                if text.strip():
                    self.stt_to_chatbot_queue.put(text)

                    idx = len(self.transcript)  # insert to avoid race conditions with indexes
                    self.transcript.insert(
                        idx, {
                            "speaker": "caller",
                            "text": text,
                            "datetime": dt.datetime.now().isoformat()
                        })
                    self.stt_to_classification_queue.put(idx)

                    stream.finishStream()
                    stream = self.deep_speech.createStream()

                self.caller_audio_chunk = np.array([], dtype='int16')

    def make_greeting(self, one_party_consent):
        self.chatbot_to_tts_queue.put(
            f"Hi. This is {self.name} how may I help you?")

        if not one_party_consent:
            self.chatbot_to_tts_queue.put("Keep in mind, I record all calls")
Example #33
0
parser = argparse.ArgumentParser(description=None)
parser.add_argument('--input',
                    type=str,
                    dest="input",
                    required=True,
                    help="Input audio .wav file at 16KHz")
args = parser.parse_args()
while len(sys.argv) > 1:
    sys.argv.pop()

BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
N_FEATURES = 26
N_CONTEXT = 9
MODEL_PATH = 'deepspeech-0.4.1-checkpoint/models/output_graph.pb'
ALPHABET_PATH = 'deepspeech-0.4.1-checkpoint/models/alphabet.txt'
LM_PATH = 'deepspeech-0.4.1-checkpoint/models/lm.binary'
TRIE_PATH = 'deepspeech-0.4.1-checkpoint/models/trie'

ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH, BEAM_WIDTH)
ds.enableDecoderWithLM(ALPHABET_PATH, LM_PATH, TRIE_PATH, LM_ALPHA, LM_BETA)

# Audio Path
AUDIO_PATH = args.input
# Read the audio with soundfile; the transcription should say "experience"
audio_data, sample_rate = soundfile.read(AUDIO_PATH, dtype='int16')

print(audio_data)
print(ds.stt(audio_data, sample_rate))
Example #34
0
lastDetection = 0
startRecording = 0
endRecording = 0
newAudioDetected = False

audio = pyaudio.PyAudio()  # instantiate PyAudio

#recording prerequisites
stream = audio.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

# Load the model
ds = Model(sys.argv[1], 500)

# Start recording
frames = []

# Start detection
newAudioDetected = False
lastDetection = current_milli_time()

while True:
    # Capture the audio
    data = stream.read(CHUNK)
    data_chunk = array('h', data)
    vol = max(data_chunk)
    # Volume detection:
    if (vol >= 800):
Example #35
0
def transcriber(inputdir, md, lm_alpha, lm_beta):
    toolbar_width = 40
    inputdir_len = len(fnmatch.filter(os.listdir(path + inputdir), '*.wav'))
    print(inputdir_len)
    if (inputdir_len > toolbar_width):
        incriment = math.trunc(inputdir_len / toolbar_width)
    else:
        toolbar_width = inputdir_len
        incriment = 1
    count = 1
    outputList = []

    BEAM_WIDTH = 500

    deep = Model(md + '/output_graph.pbmm', BEAM_WIDTH)
    enabled = deep.enableDecoderWithLM(md + '/lm.binary', md + '/trie',
                                       lm_alpha, lm_beta)
    print('Decoder Enabled <0=true>:', enabled)

    # setup progress bar
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    filelist = sorted(os.listdir(path + inputdir))
    # writes filelist to an output file
    with open(
            os.path.join(path + inputdir,
                         inputdir.rsplit('/', 1)[1] + '_filelist.txt'),
            'w') as writer:
        writer.writelines('\n'.join(filelist) + '\n')

    for filename in filelist:
        with open(os.path.join(path + inputdir, filename),
                  'r') as f:  # open in readonly mode

            # Skip anything that is not a wav file
            if not filename.endswith(".wav"):
                continue

            # Skip files that are already converted _mono.wav outputs
            if filename.endswith("_mono.wav"):
                continue

            # Check if the file has an associated _mono.wav file in the directory
            if filename.rsplit(
                    '.', 1)[0] + "_mono.wav" in os.listdir(path + inputdir):
                #if yes, prepare the _mono.wav file
                fs, audio = sound_utils.prepare_input(
                    path + inputdir + "/" + filename.rsplit('.', 1)[0] +
                    "_mono.wav")
            else:
                #if no, create an _mono.wav file and prepare that file
                sound_utils.stereo_to_mono(path + inputdir + "/" + filename)
                fs, audio = sound_utils.prepare_input(
                    path + inputdir + "/" + filename.rsplit('.', 1)[0] +
                    "_mono.wav")

            # run prepared audio through DeepSpeech
            result = deep.stt(audio)
            # remove generated processed file
            os.remove(path + inputdir + "/" + filename.rsplit('.', 1)[0] +
                      "_mono.wav")
            # add the result to the outputList
            outputList.append(result + "\n")

            # progress bar increment
            if (count == incriment):
                sys.stdout.write("-")
                sys.stdout.flush()
                count = 0
            count += 1

    sys.stdout.write("]\n")  # this ends the progress bar

    # writes results to an output file
    with open(
            os.path.join(path + inputdir,
                         inputdir.rsplit('/', 1)[1] + '_output.txt'),
            'w') as writer:
        writer.writelines(outputList)

    return