Example #1
def create_model(engine, model, scorer):
    """Instantiate model and scorer

    Args:
        engine : "ds" for DeepSpeech and "stt" for Coqui STT
        model : .pbmm model file
        scorer : .scorer file
    """

    try:
        if engine == "ds":
            ds = DModel(model)
        else:
            ds = SModel(model)
    except Exception:
        _logger.error("Invalid model file")
        sys.exit(1)

    try:
        ds.enableExternalScorer(scorer)
    except Exception:
        _logger.warning(
            "Invalid scorer file. Running inference using only the model file")
    return ds
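
A minimal usage sketch (not from the original source), assuming DModel and SModel alias the DeepSpeech and Coqui STT Model classes and that _logger is a configured logging.Logger:

# Hedged sketch: these aliases are assumptions about what create_model expects.
from deepspeech import Model as DModel
from stt import Model as SModel  # Coqui STT

ds = create_model("ds", "deepspeech-0.9.3-models.pbmm",
                  "deepspeech-0.9.3-models.scorer")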
Example #2
def load(model, scorer, verbose=True, beam_width=None, lm_alpha=None, lm_beta=None, hot_words=None):
    """Load the model and optional scorer."""

    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from file {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from file {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))
    return ds, desired_sample_rate
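
For illustration, a hedged usage sketch of load() on a mono WAV already at the model's sample rate; the file names are placeholders, not from the original source:

import wave
import numpy as np

ds, rate = load("deepspeech-0.9.3-models.pbmm",
                "deepspeech-0.9.3-models.scorer", verbose=False)
with wave.open("sample.wav", "rb") as fin:
    assert fin.getframerate() == rate  # resample first if this fails
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
print(ds.stt(audio))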
Example #3
    def __init__(self):

        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()
        # sphinx-doc: python_ref_model_start
        model_path = os.path.dirname(os.path.abspath(__file__))

        ds = Model(os.path.join(model_path, args.model))
        # sphinx-doc: python_ref_model_stop
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end),
              file=sys.stderr)

        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        self.desired_sample_rate = ds.sampleRate()

        if args.scorer:
            print('Loading scorer from file {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(os.path.join(model_path, args.scorer))
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)

            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        self.ds = ds
Example #4
parser.add_argument('--scorer',
                    help='Path to the .scorer file')
parser.add_argument('--beam_width', type=int, default=500,
                    help='Beam width for the CTC decoder')
parser.add_argument('--port', type=int, default=8008,
                    help='The port number to listen on')
args = parser.parse_args()

# Load in the model
logging.info("Loading model from %s" % args.model)
model = Model(args.model)

# Configure it
model.setBeamWidth(args.beam_width)
if args.scorer:
    logging.info("Loading scorer from %s" % (args.scorer,))
    model.enableExternalScorer(args.scorer)

# Set up the server socket
logging.info("Opening socket on port %d" % (args.port,))
sckt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sckt.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sckt.bind(('0.0.0.0', args.port))
sckt.listen(5)

# Do this forever
while True:
    try:
        # Get a connection
        logging.info("Waiting for a connection")
        (conn, addr) = sckt.accept()
        logging.info("Got connection from %s" % (addr,))
Example #5
import os
import wave
import numpy as np
import sys
import shlex
import subprocess
from deepspeech import Model
from tqdm import tqdm

try:
    from shlex import quote
except ImportError:
    from pipes import quote

model = Model("deepspeech-0.9.3-models.pbmm")
model.enableExternalScorer("deepspeech-0.9.3-models.scorer")
desired_sample_rate = model.sampleRate()
PATH = os.path.join("LJSpeech-1.1", "wavs")
TOTAL_SAMPLES = 100


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - ".format(
        quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd),
                                         stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
    except OSError as e:
        raise OSError(e.errno,
                      'SoX not found, use {}hz files or install it: {}'.format(
                          desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)

Example #6
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(
                self, frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": True,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
Example #7
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exit')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

#     print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
#     print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
#         print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
#         print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
#         print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

#     print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: " + ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
Example #8
def _get_model():
    ds = Model("data/quran/output_graph.pb")
    ds.enableExternalScorer("data/quran/lm/quran.scorer")
    return ds
Example #9
class DeepSpeechInput(AudioInput):
    """
    Input from DeepSpeech using the US English language model.
    """
    def __init__(self,
                 notifier,
                 rate=None,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'models.pbmm'),
                 scorer=os.path.join(_MODEL_DIR, 'models.scorer')):
        """
        @see AudioInput.__init__()

        :type  rate:
        :param rate:
            The override for the rate, if not the model's one.
        :type  wav_dir:
        :param wav_dir:
            Where to save the wave files, if anywhere.
        :type  model:
        :param model:
            The path to the DeepSpeech model file.
        :type  scorer:
        :param scorer:
            The path to the DeepSpeech scorer file.
        """
        # If these don't exist then DeepSpeech will segfault when inferring!
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model, ))

        # Load in and configure the model.
        LOG.info("Loading model from %s" % (model, ))
        self._model = Model(model)
        if os.path.exists(scorer):
            LOG.info("Loading scorer from %s" % (scorer, ))
            self._model.enableExternalScorer(scorer)

        # Handle any rate override
        if rate is None:
            rate = self._model.sampleRate()

        # We can now init the superclass
        super(DeepSpeechInput, self).__init__(notifier,
                                              format=pyaudio.paInt16,
                                              channels=1,
                                              rate=rate,
                                              wav_dir=wav_dir)

        # Where we put the stream context
        self._context = None

    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        if self._context is None:
            self._context = self._model.createStream()
        audio = numpy.frombuffer(data, numpy.int16)
        self._context.feedAudioContent(audio)

    def _decode(self):
        """
        @see AudioInput._decode()
        """
        if self._context is None:
            # No context means no tokens
            LOG.warning("Had no stream context to close")
            tokens = []
        else:
            # Finish up by finishing the decoding
            words = self._context.finishStream()
            LOG.info("Got: %s" % (words, ))
            self._context = None

            # And tokenize
            tokens = [
                Token(word.strip(), 1.0, True) for word in words.split(' ')
                if len(word.strip()) > 0
            ]
        return tokens
Example #10
def main():
    if args.min >= args.max:
        print("Error: --min must be smaller than --max.")
    else:
        test_file(args.audio, args.hot_words.split(','), args.min, args.max, args.steps)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DeepSpeech hot-word adjusting.')
    parser.add_argument('--model', required=True,
                    help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=True,
                    help='Path to the external scorer file')
    parser.add_argument('--audio', type=str, required=True,
                    help='Path to the audio file to run (WAV format)')
    parser.add_argument('--min', type=float, default=-10.0,
                    help='Minimum boost value.')
    parser.add_argument('--max', type=float, default=10.0,
                    help='Maximum boost value.')
    parser.add_argument('--steps', type=int, default=6,
                    help='Number of tests per each hot-word.')
    parser.add_argument('--hot_words', type=str, required=True,
                    help='Hot-words separated by comma.')

    args = parser.parse_args()

    DeepSpeech = Model(args.model)
    DeepSpeech.enableExternalScorer(args.scorer)

    main()
Example #11
def load_model(model_path, scorer_path):
    model = Model(model_path)
    model.enableExternalScorer(scorer_path)
    return model
Example #12
def load_model(model_file, scorer_file):
    ds = Model(model_file)

    ds.enableExternalScorer(scorer_file)
    return ds
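
A hypothetical streaming usage of this loader (the paths are placeholders): the DeepSpeech streaming API lets you feed the WAV in chunks and print intermediate hypotheses along the way.

import wave
import numpy as np

ds = load_model("deepspeech-0.9.3-models.pbmm",
                "deepspeech-0.9.3-models.scorer")
stream = ds.createStream()
with wave.open("sample.wav", "rb") as fin:
    while True:
        chunk = fin.readframes(1024)
        if not chunk:
            break
        stream.feedAudioContent(np.frombuffer(chunk, np.int16))
        print(stream.intermediateDecode())  # running partial transcript
print(stream.finishStream())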
Example #13
# Enable logging
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)

logger = logging.getLogger("DeepspeechBot")

TOKEN = os.environ.get("TELEGRAM_TOKEN")
BASE_PATH = os.path.abspath(os.path.dirname(sys.argv[0]))
AUDIO_FILE_PATH = BASE_PATH + "/tmp/{}_{}.{}"
MODEL_PATH = os.environ.get("MODEL_PATH", BASE_PATH + "/model/model.pbmm")
SCORER_PATH = os.environ.get("SCORER_PATH", BASE_PATH + "/model/kenlm.scorer")

ds = Model(MODEL_PATH)
ds.enableExternalScorer(SCORER_PATH)


def start(update, context):
    # (Catalan) "Hi! I'm a bot built to test the automatic speech recognition
    # capabilities of Mozilla's DeepSpeech engine with CommonVoice data. Send
    # me a voice message and I'll transcribe it. You can find more information
    # about the Catalan model here."
    update.message.reply_text(
        "Hola\! Sóc un bot creat per a provar les capacitats de reconeixement automàtic de la parla "
        "del motor DeepSpeech de Mozilla amb les dades de CommonVoice\. Envia'm un missatge de veu i "
        "el transcriuré\. Pots trobar més informació sobre el model català [aquí](https://github.com/ccoreilly/deepspeech-catala)\.",
        parse_mode="MarkdownV2",
    )


def info(update, context):
    # (Catalan) "Send me a voice message and I'll transcribe it"
    update.message.reply_text("Envia'm un missatge de veu i el transcriuré")

Example #14
    def load_model(self, model_path, scorer_path):
        ds = Model(model_path)
        self.desired_sample_rate = ds.sampleRate()
        ds.enableExternalScorer(scorer_path)
        self.ds = ds
Example #15
def load_model(graph_path, scorer):
    ds = Model(graph_path)
    ds.enableExternalScorer(scorer)
    return ds
Example #16
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from file {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
        test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
        for t in test.tokens:
            print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")

    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
Example #17
    def record_voice_and_predict_text(self):
        """Records the speech and predicts its text """
        #Recording the speech

        stream_file_name = 'AudioFile/speech_stream.wav'
        stream_format = pyaudio.paInt16  # Sampling size and format
        no_of_channels = 1  # Number of audio channels
        sampling_rate = 16000  # Sampling rate in Hertz
        frames_count = 1024  # Number of frames per buffer
        record_seconds = 5

        stream = pyaudio.PyAudio()

        stream_data = stream.open(format=stream_format,
                                  channels=no_of_channels,
                                  rate=sampling_rate,
                                  input=True,
                                  frames_per_buffer=frames_count)
        frames = [
            stream_data.read(frames_count)
            for i in range(0, int(sampling_rate / frames_count *
                                  record_seconds))
        ]
        stream_data.stop_stream()
        stream_data.close()
        stream.terminate()

        wave_file = wave.open(stream_file_name, 'wb')
        wave_file.setnchannels(no_of_channels)
        wave_file.setsampwidth(stream.get_sample_size(stream_format))
        wave_file.setframerate(sampling_rate)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()

        try:
            self.label_info.setText('Recording completed.')
        except Exception:
            pass

        #Text prediction Part
        alpha = 0.75
        beta = 1.85
        beam_width = 500

        # Initialize the model
        speech_model = Model(MODEL_PATH)

        # set beam width. A larger beam width value generates better results at the cost of decoding time.
        speech_model.setBeamWidth(beam_width)

        # Enable language scorer to improve the accuracy
        speech_model.enableExternalScorer(SCORER_PATH)
        # You can play with setting the model Beam Width, Scorer language model weight and word insertion weight

        # Set hyperparameters alpha and beta of the external scorer.
        # alpha: Language model weight.
        # beta: Word insertion weight
        speech_model.setScorerAlphaBeta(alpha, beta)

        # Use scipy to convert the wav file into a numpy array
        _, audio = wav.read(stream_file_name)
        text = speech_model.stt(audio)
        try:
            self.text_pred.setText(text)
        except Exception:
            pass
        show_images(text)
Example #18
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    FORMAT = pyaudio.paInt16
    SAMPLE_RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """

        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it, and sends it to the model to be
        transcribed. Returns the transcription as a string. """

        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in the form of hot-words and boosts, adds them to the
        language model, and returns the list of added hot-words """

        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(data.get(hot_word))

                # Adding the hot-word and its boost to the language model
                self.model.addHotWord(word, boost)

                # Printing on the prompt the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def deep_stream(self):
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """
        Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate. Yields Frames of the requested duration.
        """

        # audio = np.frombuffer(audio, np.int16)
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
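
Note that frame_generator yields Frame objects the class never defines. In Mozilla's VAD streaming examples this is a small value type, so a stand-in like the following (an assumption, not code from this project) makes the generator self-contained:

from collections import namedtuple

# Assumed shape: raw PCM bytes plus a start time and duration in seconds.
Frame = namedtuple("Frame", ["bytes", "timestamp", "duration"])

Frames produced this way can then be screened with the webrtcvad instance, e.g. keeping only those where self.vad.is_speech(frame.bytes, self.sample_rate) returns True.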
Example #19
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """
    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it, and sends it to the model to be
        transcribed. Returns the transcription as a string. """

        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in the form of hot-words and boosts, adds them to the
        language model, and returns the list of added hot-words """

        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(data.get(hot_word))

                # Adding the hot-word and its boost to the language model
                self.model.addHotWord(word, boost)

                # Printing on the prompt the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def sample_rate(self):
        return self.model.sampleRate()
Example #20
File: brain.py  Project: 7ae/lucia
class Brain:
    MODULE_BASE_PATH = 'lucia.tasks.'

    def __init__(self):
        self.model = None
        self.r = sr.Recognizer()
        self.nlp = spacy.load(conf.get_property('spacy')['model'])
        self.espeak = conf.get_property('espeak')

        # Load low-level Duckling model
        self.duckling = Duckling()
        self.duckling.load(
            languages=conf.get_property('duckling')['languages'])

        # Remember tasks
        self.task_memory = []

    def create_model(self):
        # Create a DeepSpeech model with model path
        self.model = Model(conf.get_property('deepspeech')['model_path'])
        # Enable decoding using an external scorer
        self.model.enableExternalScorer(
            conf.get_property('deepspeech')['scorer_path'])

    def listen(self, debug_mode=False):
        while True:
            with sr.Microphone(sample_rate=conf.get_property(
                    'speech_recognition')['audio_rate']) as source:
                # Listen for a while and adjust the energy threshold to start and stop recording voice to account for ambient noise
                self.r.adjust_for_ambient_noise(
                    source,
                    duration=conf.get_property(
                        'speech_recognition')['energy_threshold'])
                self.r.dynamic_energy_threshold = True

                if debug_mode is False:
                    print("Say something")
                    audio = self.r.listen(source)
                    # Speech to text
                    audio = np.frombuffer(audio.frame_data, np.int16)
                    text = self.model.stt(audio)
                    self.speak(text)
                else:
                    text = input()

                # Wake up on hearing the wake word
                #if any(subtext in text for subtext in conf.get_property('wake_words')):
                #  self.understand(text)
                self.understand(text)

    def speak(self, text):
        subprocess.call('espeak-ng -v {}+{}{} "{}"'.format(
            self.espeak['language'], self.espeak['gender'],
            self.espeak['pitch'], text),
                        shell=True)

    def understand(self, sentence):
        # Break paragraph into sentences
        tokenized_sentence = sent_tokenize(sentence)

        # Break sentence into words
        for sent in tokenized_sentence:
            tokenized_word = word_tokenize(sent)

            # Tag corpora with universal POS tagset
            # For tag list, read https://www.nltk.org/book/ch05.html#tab-universal-tagset
            pos_tags = nltk.pos_tag(tokenized_word, tagset='universal')

            # Divide sentence into noun phrases with regular expression
            grammar = 'NOUN: {<DET>?<ADJ>*<NOUN>}'
            cp = nltk.RegexpParser(grammar)
            # Find chunk structure
            cs = cp.parse(pos_tags)
            # B-{tag} beginning, I-{tag} inside, O-{tag} outside
            iob_tags = np.asarray(tree2conlltags(cs)).tolist()

            # Recognize named entities
            doc = self.nlp(sent)

            # Parse word into numeral, ordinal, and time
            parse = lambda ne: dict([[
                _['dim'], _['value']['value']
            ] for _ in self.duckling.parse(
                ne, dim_filter=conf.get_property('duckling')['dimensions'])])
            # [Word, character positions and entity type]. For all entity types, read https://spacy.io/api/annotation#named-entities
            ne = list([
                ent.text, ent.start_char, ent.end_char, ent.label_,
                parse(ent.text)
            ] for ent in doc.ents)

            ne_tags = [_.ent_type_ for _ in doc]
            # Merge iob tags and named entity tags
            tagged_sent = [
                list(np.append(iob_tags[i], ne_tags[i]))
                for i in range(len(iob_tags))
            ]
            tagged_sent = ''.join(str(x) for x in tagged_sent)

            self.decide(tagged_sent, ne)

    def think(self, pattern, tagged_sent):
        # Match tagged sentence against combinations of POS tags, words in any order: (?=.*\bword\b)(?=.*\bADJ\bNOUN\b).*
        r = re.compile(
            '(?=.*\\b' +
            pattern.replace('  ', '\\b.*\\b').replace(' ', '\\b)(?=.*\\b') +
            '\\b).*')
        return r.search(tagged_sent)

    def decide(self, tagged_sent, named_entity):
        for task in conf.get_property('tasks'):
            for pattern in conf.get_property('tasks')[task]:
                # If sentence matches any pattern, dynamically create class
                if self.think(pattern, tagged_sent):
                    # Split module name and class name with dot
                    module = importlib.import_module(self.MODULE_BASE_PATH +
                                                     task.rsplit('.', 1)[0])
                    instance = getattr(module, task.rsplit('.', 1)[1])()
                    print(instance)

                    # Search whether task_memory contains the same class instance
                    _run = False
                    for mem in self.task_memory:
                        if type(instance) == type(mem):
                            mem.run(self, tagged_sent, named_entity)
                            _run = True
                            break
                    if not _run:
                        # If not exists, store new class instance in task_memory
                        self.task_memory = [
                            instance.run(self, tagged_sent, named_entity)
                        ] + self.task_memory

                    break
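
A hypothetical driver for this class (not from the original project): the DeepSpeech model must be created before listening, since listen() calls self.model.stt on captured audio.

brain = Brain()
brain.create_model()
brain.listen(debug_mode=True)  # debug mode reads typed input instead of audio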
Example #21
    clip = converted_audio[annotation['start']:annotation['end']]
    clip.export(annotation['clip'], format = 'wav')

converted_audio_file.close()

f.write('\nAudio cut successfully.\n')

print("PROGRESS: 0.7 Starting STT with DeepSpeech", flush = True)
temp_dir = tempfile.TemporaryDirectory()

# Model path has to be taken from ELAN
ds = Model(params['model'])

if params['language_model']:

    ds.enableExternalScorer(params['language_model'])

f.write("\n\nLoaded DeepSpeech model.\n\n")

for annotation in annotations:

    fin = wave.open(annotation['clip'].name, 'rb')
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    annotation['value'] = ds.stt(audio)
    fin.close()


# Then open 'output_tier' for writing, and return all of the new phoneme
# strings produced by Persephone as the contents of <span> elements (see
# below).
print("PROGRESS: 0.95 Preparing output tier", flush = True)
Example #22
def main():
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer',
                        required=False,
                        help='Path to the external scorer file')
    parser.add_argument(
        '--prediction_in',
        required=True,
        help='Path to the directory with sound files (mp3/ogg/wav) to analyze')
    parser.add_argument(
        '--prediction_out',
        required=True,
        help='Path to the directory for moving the processed sound files to')
    parser.add_argument(
        '--prediction_tmp',
        required=False,
        help=
        'Path to the temp directory for storing the predictions initially before moving them to "--prediction_out"'
    )
    parser.add_argument(
        '--continuous',
        action='store_true',
        help='Whether to continuously load sound files and perform prediction',
        required=False,
        default=False)
    parser.add_argument(
        '--delete_input',
        action='store_true',
        help=
        'Whether to delete the input files rather than move them to "--prediction_out" directory',
        required=False,
        default=False)
    parser.add_argument('--beam_width',
                        type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha',
        type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta',
        type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--json',
        required=False,
        action='store_true',
        help='Output json from metadata with timestamp of each word')
    parser.add_argument(
        '--candidate_transcripts',
        type=int,
        default=3,
        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument(
        '--normalize',
        required=False,
        action='store_true',
        help='Whether to apply standard amplitude normalization')
    parsed = parser.parse_args()

    print('Loading model from file {}'.format(parsed.model))
    ds = Model(parsed.model)
    if parsed.beam_width:
        ds.setBeamWidth(parsed.beam_width)

    if parsed.scorer:
        print('Loading scorer from file {}'.format(parsed.scorer))
        ds.enableExternalScorer(parsed.scorer)
        if parsed.lm_alpha and parsed.lm_beta:
            ds.setScorerAlphaBeta(parsed.lm_alpha, parsed.lm_beta)

    process(model=ds,
            prediction_in=parsed.prediction_in,
            prediction_out=parsed.prediction_out,
            prediction_tmp=parsed.prediction_tmp,
            continuous=parsed.continuous,
            delete_input=parsed.delete_input,
            json=parsed.json,
            candidate_transcripts=parsed.candidate_transcripts,
            normalize=parsed.normalize)
Example #23
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": False,
            "audio": True
        },
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)

                stream = model.createStream()

                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReceiver is not set. Abort.")
            break
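
A minimal sketch of wiring app_sst into a Streamlit page; the paths and decoder hyperparameters below are placeholders, not values from the original app:

import streamlit as st

st.title("Real-time speech-to-text (DeepSpeech)")
app_sst(
    model_path="models/deepspeech-0.9.3-models.pbmm",
    lm_path="models/deepspeech-0.9.3-models.scorer",
    lm_alpha=0.931,
    lm_beta=1.18,
    beam=100,
)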
Example #24
from deepspeech import Model
import gradio as gr
import numpy as np

model_file_path = "deepspeech-0.8.2-models.pbmm"
lm_file_path = "deepspeech-0.8.2-models.scorer"
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    if sr not in (
            48000,
            16000,
    ):  # DeepSpeech only supports 16 kHz (we convert 48 kHz -> 16 kHz)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        y = (((y / max(np.max(y), 1)) * 32767).reshape(
            (-1, 3)).mean(axis=1).astype("int16"))
        sr = 16000
    return sr, y


def transcribe(speech, stream):
    _, y = reformat_freq(*speech)
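
The snippet is cut off here. A hedged sketch of how such a streaming Gradio callback typically continues, keeping one DeepSpeech stream alive across calls (an assumption, not the original code):

def transcribe_sketch(speech, stream):
    _, y = reformat_freq(*speech)
    if stream is None:
        stream = model.createStream()
    stream.feedAudioContent(y)
    # Return the running transcript plus the stream as state for the next call.
    return stream.intermediateDecode(), stream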
Example #25
class SocketIOInput(InputChannel):
    """A socket.io input channel."""
    @classmethod
    def name(cls):
        return "socketio"

    @classmethod
    def from_credentials(cls, credentials):
        credentials = credentials or {}
        return cls(
            credentials.get("user_message_evt", "user_uttered"),
            credentials.get("bot_message_evt", "bot_uttered"),
            credentials.get("namespace"),
            credentials.get("session_persistence", False),
            credentials.get("socketio_path", "/socket.io"),
        )

    def __init__(self,
                 user_message_evt: Text = "user_uttered",
                 bot_message_evt: Text = "bot_uttered",
                 namespace: Optional[Text] = None,
                 session_persistence: bool = False,
                 socketio_path: Optional[Text] = '/socket.io'):
        self.bot_message_evt = bot_message_evt
        self.session_persistence = session_persistence
        self.user_message_evt = user_message_evt
        self.namespace = namespace
        self.socketio_path = socketio_path
        self.speech_to_text_model = Model('stt/deepspeech-0.9.1-models.pbmm')
        self.speech_to_text_model.enableExternalScorer(
            'stt/deepspeech-0.9.1-models.scorer')

    def blueprint(self, on_new_message):
        sio = AsyncServer(async_mode="sanic",
                          logger=True,
                          cors_allowed_origins='*')
        socketio_webhook = SocketBlueprint(sio, self.socketio_path,
                                           "socketio_webhook", __name__)

        @socketio_webhook.route("/", methods=['GET'])
        async def health(request):
            return response.json({"status": "ok"})

        @sio.on('connect')
        async def connect(sid, environ):
            print("User {} connected to socketIO endpoint.".format(sid))

        @sio.on('disconnect')
        async def disconnect(sid):
            print("User {} disconnected from socketIO endpoint."
                  "".format(sid))

        @sio.on('session_request')
        async def session_request(sid, data):
            print('Session request received')

            if data is None:
                data = {}
            if 'session_id' not in data or data['session_id'] is None:
                data['session_id'] = uuid.uuid4().hex
            await sio.emit("session_confirm", data['session_id'], room=sid)
            print("User {} connected to socketIO endpoint.".format(sid))

        @sio.on('user_uttered')
        async def handle_message(sid, data):
            print('User uttered')
            output_channel = SocketIOOutput(sio, sid, self.bot_message_evt,
                                            data['message'])
            if data['message'] == "/get_started":
                message = data['message']
            else:
                # Receive the audio
                received_file = 'output_' + sid + '.wav'

                request.urlretrieve(data['message'], received_file)

                # fs, audio = wav.read("output_{0}.wav".format(sid))
                input_audio_file = wave.open("output_{0}.wav".format(sid),
                                             'rb')
                converted_audio_to_bytes = numpy.frombuffer(
                    input_audio_file.readframes(input_audio_file.getnframes()),
                    numpy.int16)
                input_audio_file.close()
                message = self.speech_to_text_model.stt(
                    converted_audio_to_bytes)

                insert_user_message_to_database(message, sid)

                await sio.emit(self.user_message_evt, {"text": message},
                               room=sid)

            message_rasa = UserMessage(message,
                                       output_channel,
                                       sid,
                                       input_channel=self.name())
            await on_new_message(message_rasa)

        return socketio_webhook
Example #26
        devs = [
            a.get_device_info_by_index(i) for i in range(a.get_device_count())
        ]
        for i, dev in enumerate(devs):
            if "HD Pro Webcam" in dev["name"]:
                device_index = i
                print("connecting to:\n{}\n".format(dev))
                break

    # load in DS model
    model_path = os.path.join(os.getcwd(), "models")
    pb = glob.glob(model_path + "/*.pbmm")[0]
    scorer = glob.glob(model_path + "/*.scorer")[0]
    # load them in
    ds = Model(pb)
    ds.enableExternalScorer(scorer)
    model = ds
    desired_sample_rate = ds.sampleRate()

    print("TEST_MODE: {}".format(TEST_MODE))

    if not TEST_MODE:
        # connect to mqtt server
        client = mqtt.Client(client_id="",
                             clean_session=True,
                             userdata=None,
                             transport="tcp")

        client.on_connect = on_connect
        client.connect("127.0.0.1", 1883, 60)
        # start another thread to react to incoming messages
Example #27
def create_app(args):
    logging.basicConfig(level=logging.DEBUG)
    sys.stdout = LoggerWriter(logging.debug)
    sys.stderr = LoggerWriter(logging.warning)
    if not args.offline:
        from app.init import boot
        boot()

    from app.language import languages
    app = Flask(__name__)

    project_directory = args.project_directory
    if not os.path.exists(project_directory):
        os.makedirs(project_directory)

    # For faster access
    language_map = {}
    for l in languages:
        language_map[l.code] = l.name

    if args.debug:
        app.config['TEMPLATES_AUTO_RELOAD'] = True
    app.config['MAX_CONTENT_LENGTH'] = 64 * 1024 * 1024
    # Map user-defined frontend languages to argos language objects.
    if args.frontend_language_source == "auto":
        frontend_argos_language_source = type('obj', (object, ), {
            'code': 'auto',
            'name': 'Auto Detect'
        })
    else:
        frontend_argos_language_source = next(
            iter([
                l for l in languages if l.code == args.frontend_language_source
            ]), None)

    frontend_argos_language_target = next(
        iter([l for l in languages
              if l.code == args.frontend_language_target]), None)

    # Raise AttributeError to prevent app startup if user input is not valid.
    if frontend_argos_language_source is None:
        raise AttributeError(
            f"{args.frontend_language_source} as frontend source language is not supported."
        )
    if frontend_argos_language_target is None:
        raise AttributeError(
            f"{args.frontend_language_target} as frontend target language is not supported."
        )

    from flask_limiter import Limiter
    if args.req_limit > 0 or args.api_keys:
        limiter = Limiter(app,
                          key_func=get_remote_address,
                          default_limits=get_routes_limits(
                              args.req_limit,
                              Database() if args.api_keys else None))
    else:
        # Fallback with no default limits so the @limiter.exempt decorators
        # below remain defined even when rate limiting is disabled.
        limiter = Limiter(app, key_func=get_remote_address)
    model_load_start = timer()
    ds = Model(os.path.join(home_dir, "models",
                            "deepspeech-0.9.3-models.pbmm"))
    ds.enableExternalScorer(
        os.path.join(home_dir, "models", "deepspeech-0.9.3-models.scorer"))
    model_load_end = timer() - model_load_start
    logging.info('Loaded model in {:.3}s.'.format(model_load_end))
    desired_sample_rate = ds.sampleRate()
    logging.info('Model optimized for a sample rate of ' +
                 str(desired_sample_rate))
    uuid4hex = re.compile(
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\Z', re.I)

    @app.errorhandler(400)
    def invalid_api(e):
        return jsonify({"error": str(e.description)}), 400

    @app.errorhandler(500)
    def server_error(e):
        return jsonify({"error": str(e.description)}), 500

    @app.errorhandler(429)
    def slow_down_error(e):
        return jsonify({"error": "Slowdown: " + str(e.description)}), 429

    @app.route("/")
    @limiter.exempt
    def index():
        return render_template('index.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    @app.route("/projects")
    @limiter.exempt
    def projects():
        return render_template('projects.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               projects=loadAllProjects(),
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    @app.route("/project/<id>")
    @limiter.exempt
    def project(id):
        if not uuid4hex.match(id):
            logging.error("Invalid project id")
            return redirect("/projects")
        return render_template('project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               project=loadProjectDetails(id),
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    @app.route("/project/<id>/delete")
    @limiter.exempt
    def projectDelete(id):
        delete_project(id)
        return redirect("/projects")

    @app.route("/project/<id>/transcription")
    @limiter.exempt
    def projectTranscribe(id):
        if not uuid4hex.match(id):
            flash("Invalid project id")
            return redirect("/projects")
        logging.info("Starting the transcription job for project ID " + id)
        cmd = [
            sys.executable,
            os.path.join(home_dir, 'scripts', 'batch.py'),
            "--target-dir",
            os.path.join(project_directory, id)
        ]
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        return redirect("/project/" + id)

    @app.route("/project/<id>/download/<file>")
    def download(id, file):
        # todo validate the file part

        metadata = loadProjectDetails(id)
        if metadata is None:
            logging.info("Unable to find metadata for project ID: " + id)
            return redirect("/projects")
        return send_from_directory(directory=metadata['project_dir'],
                                   filename=file,
                                   as_attachment=True)

    @app.route("/create-project")
    @limiter.exempt
    def createProject():
        return render_template('create-project.html',
                               gaId=args.ga_id,
                               frontendTimeout=args.frontend_timeout,
                               offline=args.offline,
                               api_keys=args.api_keys,
                               web_version=os.environ.get('LT_WEB')
                               is not None)

    def allowed_file(filename):
        return '.' in filename and filename.rsplit(
            '.', 1)[1].lower() in ALLOWED_EXTENSIONS

    @app.route('/new-project-upload', methods=['GET', 'POST'])
    def uploadProject():
        if request.method == 'POST':
            # check if the post request has the file part
            if 'file' not in request.files:
                return redirect(request.url)
            file = request.files['file']
            # if user does not select file, browser also
            # submit an empty part without filename
            if file.filename == '':
                return redirect(request.url)
            if file and allowed_file(file.filename):
                project_id = str(uuid.uuid4())
                if not os.path.exists(
                        os.path.join(project_directory, project_id)):
                    os.makedirs(os.path.join(project_directory, project_id))
                fileending = file.filename.rsplit('.', 1)[1].lower()
                file.save(
                    os.path.join(project_directory, project_id,
                                 "rawMedia." + fileending))
                # TODO store original file name
                metadata = createMetadata(project_id, request.form['name'],
                                          fileending)
                with open(
                        os.path.join(project_directory, project_id,
                                     "metadata.json"), 'w') as f:
                    json.dump(metadata, f)

                return redirect("./project/" + project_id)

    @timeit
    def createMetadata(project_id, name, ending):
        metadata = {"name": name, "fileEnding": ending}
        in_filename = os.path.join(project_directory, project_id,
                                   "rawMedia." + ending)
        probe = ffmpeg.probe(in_filename)
        video_stream = next((stream for stream in probe['streams']
                             if stream['codec_type'] == 'video'), None)
        logging.debug(str(video_stream))
        metadata['width'] = int(video_stream['width'])
        metadata['height'] = int(video_stream['height'])
        metadata['durationSeconds'] = float(video_stream['duration'])
        # Grab a frame at t=3s, scale it to 512px wide, and save it as the
        # project thumbnail
        (ffmpeg
         .input(in_filename, ss=3)
         .filter('scale', 512, -1)
         .output(os.path.join(project_directory, project_id, "thumbnail.png"),
                 vframes=1)
         .run())
        return metadata

    def delete_project(project_id):
        logging.info("Deleting a project with ID: " + project_id)
        # TODO make sure the ID is a valid ID and not just some bad path
        shutil.rmtree(os.path.join(project_directory, project_id))

    @app.route("/languages", methods=['GET', 'POST'])
    @limiter.exempt
    def langs():
        """
        Retrieve list of supported languages
        ---
        tags:
          - translate
        responses:
          200:
            description: List of languages
            schema:
              id: languages
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                    description: Language code
                  name:
                    type: string
                    description: Human-readable language name (in English)
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        return jsonify([{'code': l.code, 'name': l.name} for l in languages])

    # Add cors
    @app.after_request
    def after_request(response):
        response.headers.add('Access-Control-Allow-Origin', '*')
        response.headers.add('Access-Control-Allow-Headers',
                             "Authorization, Content-Type")
        response.headers.add('Access-Control-Expose-Headers', "Authorization")
        response.headers.add('Access-Control-Allow-Methods', "GET, POST")
        response.headers.add('Access-Control-Allow-Credentials', "true")
        response.headers.add('Access-Control-Max-Age', 60 * 60 * 24 * 20)
        return response

    @app.route("/project", methods=['GET'])
    def list_projects():
        """
        List available projects
        ---
        tags:
          - list
        """
        return jsonify({"projects": loadAllProjects()})

    def loadAllProjects():
        output = []
        for project_id in os.listdir(project_directory):
            project_details = loadProjectDetails(project_id)
            if project_details is not None:
                output.append(project_details)
        return output

    def loadProjectDetails(project_id):
        metadata_path = os.path.join(project_directory, project_id,
                                     "metadata.json")
        if not os.path.exists(metadata_path):
            return None
        metadata = json.loads(Path(metadata_path).read_text())
        metadata["id"] = project_id
        metadata['project_dir'] = os.path.join(project_directory, project_id)
        # TODO rely on this data for everything
        metadata['subtitles'] = []
        for file in os.listdir(metadata['project_dir']):
            if file.endswith(".srt"):
                metadata['subtitles'].append(file)
        if os.path.exists(
                os.path.join(metadata['project_dir'], "subtitles.zip")):
            metadata['subtitles'].insert(0, 'subtitles.zip')
        metadata['inputVideo'] = "rawMedia." + metadata['fileEnding']

        metadata['audio'] = "audio.wav"
        return metadata

    @app.route("/translate", methods=['POST'])
    def translate():
        """
        Translate text from a language to another
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              oneOf:
                - type: string
                  example: Hello world!
                - type: array
                  example: ['Hello world!']
            required: true
            description: Text(s) to translate
          - in: formData
            name: source
            schema:
              type: string
              example: en
            required: true
            description: Source language code
          - in: formData
            name: target
            schema:
              type: string
              example: es
            required: true
            description: Target language code
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Translated text
            schema:
              id: translate
              type: object
              properties:
                translatedText:
                  oneOf:
                    - type: string
                    - type: array
                  description: Translated text(s)
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Translation error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """

        if request.is_json:
            payload = request.get_json()  # avoid shadowing the json module
            q = payload.get('q')
            source_lang = payload.get('source')
            target_lang = payload.get('target')
        else:
            q = request.values.get("q")
            source_lang = request.values.get("source")
            target_lang = request.values.get("target")

        if not q:
            abort(400, description="Invalid request: missing q parameter")
        if not source_lang:
            abort(400, description="Invalid request: missing source parameter")
        if not target_lang:
            abort(400, description="Invalid request: missing target parameter")

        batch = isinstance(q, list)

        if batch and args.batch_limit != -1:
            batch_size = len(q)
            if args.batch_limit < batch_size:
                abort(400,
                      description=
                      "Invalid request: Request (%d) exceeds text limit (%d)" %
                      (batch_size, args.batch_limit))

        if args.char_limit != -1:
            if batch:
                chars = sum([len(text) for text in q])
            else:
                chars = len(q)

            if args.char_limit < chars:
                abort(
                    400,
                    description=
                    "Invalid request: Request (%d) exceeds character limit (%d)"
                    % (chars, args.char_limit))

        if source_lang == 'auto':
            # detect_langs expects a single string, so join batch inputs
            detect_input = ' '.join(q) if batch else q
            candidate_langs = list(
                filter(lambda l: l.lang in language_map,
                       detect_langs(detect_input)))

            if len(candidate_langs) > 0:
                candidate_langs.sort(key=lambda l: l.prob, reverse=True)

                if args.debug:
                    print(candidate_langs)

                source_lang = next(
                    iter([
                        l.code for l in languages
                        if l.code == candidate_langs[0].lang
                    ]), None)
                if not source_lang:
                    source_lang = 'en'
            else:
                source_lang = 'en'

            if args.debug:
                print("Auto detected: %s" % source_lang)

        src_lang = next(iter([l for l in languages if l.code == source_lang]),
                        None)
        tgt_lang = next(iter([l for l in languages if l.code == target_lang]),
                        None)

        if src_lang is None:
            abort(400, description="%s is not supported" % source_lang)
        if tgt_lang is None:
            abort(400, description="%s is not supported" % target_lang)

        translator = src_lang.get_translation(tgt_lang)

        try:
            if batch:
                return jsonify({
                    "translatedText":
                    [translator.translate(text) for text in q]
                })
            else:
                return jsonify({"translatedText": translator.translate(q)})
        except Exception as e:
            abort(500, description="Cannot translate text: %s" % str(e))

    @app.route("/detect", methods=['POST'])
    def detect():
        """
        Detect the language of a single text
        ---
        tags:
          - translate
        parameters:
          - in: formData
            name: q
            schema:
              type: string
              example: Hello world!
            required: true
            description: Text to detect
          - in: formData
            name: api_key
            schema:
              type: string
              example: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
            required: false
            description: API key
        responses:
          200:
            description: Detections
            schema:
              id: detections
              type: array
              items:
                type: object
                properties:
                  confidence:
                    type: number
                    format: float
                    minimum: 0
                    maximum: 1
                    description: Confidence value
                    example: 0.6
                  language:
                    type: string
                    description: Language code
                    example: en
          400:
            description: Invalid request
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          500:
            description: Detection error
            schema:
              id: error-response
              type: object
              properties:
                error:
                  type: string
                  description: Error message
          429:
            description: Slow down
            schema:
              id: error-slow-down
              type: object
              properties:
                error:
                  type: string
                  description: Reason for slow down
        """
        if request.is_json:
            payload = request.get_json()  # avoid shadowing the json module
            q = payload.get('q')
        else:
            q = request.values.get("q")

        if not q:
            abort(400, description="Invalid request: missing q parameter")

        candidate_langs = list(
            filter(lambda l: l.lang in language_map, detect_langs(q)))
        candidate_langs.sort(key=lambda l: l.prob, reverse=True)
        return jsonify([{
            'confidence': l.prob,
            'language': l.lang
        } for l in candidate_langs])
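
    # A hedged client sketch for the JSON endpoints above (the base URL is
    # illustrative):
    #
    #   import requests
    #   BASE = "http://localhost:5000"
    #   r = requests.post(BASE + "/translate",
    #                     data={"q": "Hello world!",
    #                           "source": "en", "target": "es"})
    #   print(r.json()["translatedText"])
    #   r = requests.post(BASE + "/detect", data={"q": "Hola mundo"})
    #   print(r.json())  # e.g. [{"confidence": 0.93, "language": "es"}]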

    @app.route("/frontend/settings")
    @limiter.exempt
    def frontend_settings():
        """
        Retrieve frontend specific settings
        ---
        tags:
          - frontend
        responses:
          200:
            description: frontend settings
            schema:
              id: frontend-settings
              type: object
              properties:
                charLimit:
                  type: integer
                  description: Character input limit for this language (-1 indicates no limit)
                frontendTimeout:
                  type: integer
                  description: Frontend translation timeout
                language:
                  type: object
                  properties:
                    source:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
                    target:
                      type: object
                      properties:
                        code:
                          type: string
                          description: Language code
                        name:
                          type: string
                          description: Human-readable language name (in English)
        """
        return jsonify({
            'charLimit': args.char_limit,
            'frontendTimeout': args.frontend_timeout,
            'language': {
                'source': {
                    'code': frontend_argos_language_source.code,
                    'name': frontend_argos_language_source.name
                },
                'target': {
                    'code': frontend_argos_language_target.code,
                    'name': frontend_argos_language_target.name
                }
            }
        })

    swag = swagger(app)
    swag['info']['version'] = "1.2"
    swag['info']['title'] = "LibreTranslate"

    @app.route("/spec")
    @limiter.exempt
    def spec():
        return jsonify(swag)

    SWAGGER_URL = '/docs'  # URL for exposing Swagger UI (without trailing '/')
    API_URL = '/spec'

    # Call factory function to create our blueprint
    swaggerui_blueprint = get_swaggerui_blueprint(SWAGGER_URL, API_URL)

    app.register_blueprint(swaggerui_blueprint)

    return app
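
# A minimal sketch of serving the returned app, assuming the enclosing
# factory above is exposed as create_app (hypothetical name):
#
#   app = create_app()
#   app.run(host="127.0.0.1", port=5000)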
Example #28
    print(" Recording complete.")
    audio_data = (np.frombuffer(b''.join(frames), dtype=np.int16) / 32767)
    bg_data = (np.frombuffer(b''.join(frames_bg), dtype=np.int16) / 32767)
    # denoised_data = removeNoise(audio_data, bg_data)#.astype('float32')
    return audio_data  #denoised_data


####### DeepSpeech Voice-To-Text Parameters #######
DS_FOLDER = 'deepspeech_data'
if not os.path.exists(DS_FOLDER):
    os.mkdir(DS_FOLDER)
DS_model_file_path = 'deepspeech_data/deepspeech-0.7.4-models.pbmm'
beam_width = 500
DS_model = Model(DS_model_file_path)
DS_model.setBeamWidth(beam_width)
DS_model.enableExternalScorer('deepspeech_data/deepspeech-0.7.4-models.scorer')


def get_text(data, model=DS_model):
    """
    Transcribe text from audio.

    data: audio data as in array read from librosa with sampling rate 16000.
    model: Deepspeech ASR model.
    """
    #     y , s = librosa.load(fpath, sr=16000)
    y = (data * 32767).astype('int16')
    text = model.stt(y)
    return text
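
# A hedged usage sketch for get_text (the file name is illustrative):
#
#   import librosa
#   y, sr = librosa.load("sample.wav", sr=16000)  # float32 in [-1, 1]
#   print(get_text(y))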

def main():
    global line_count
    print("AutoSub v0.1\n")

    parser = argparse.ArgumentParser(description="AutoSub v0.1")
    parser.add_argument('--model', required=True, help='DeepSpeech model file')
    parser.add_argument('--scorer', help='DeepSpeech scorer file')
    parser.add_argument('--file', required=True, help='Input video file')
    args = parser.parse_args()

    ds_model = args.model
    if not ds_model.endswith(".pbmm"):
        print("Invalid model file. Exiting\n")
        sys.exit(1)

    # Load DeepSpeech model
    ds = Model(ds_model)

    if args.scorer:
        ds_scorer = args.scorer
        if not ds_scorer.endswith(".scorer"):
            print(
                "Invalid scorer file. Running inference using only model file\n"
            )
        else:
            ds.enableExternalScorer(ds_scorer)

    input_file = args.file
    print("\nInput file:", input_file)

    base_directory = os.getcwd()
    output_directory = os.path.join(base_directory, "output")
    audio_directory = os.path.join(base_directory, "audio")
    video_file_name = input_file.split(os.sep)[-1].split(".")[0]
    audio_file_name = os.path.join(audio_directory, video_file_name + ".wav")
    srt_file_name = os.path.join(output_directory, video_file_name + ".srt")

    # Extract audio from input video file
    extract_audio(input_file, audio_file_name)

    print("Splitting on silent parts in audio file")
    silenceRemoval(audio_file_name)

    # Output SRT file
    file_handle = open(srt_file_name, "a+")

    print("\nRunning inference:")

    for file in tqdm(sort_alphanumeric(os.listdir(audio_directory))):
        audio_segment_path = os.path.join(audio_directory, file)

        # Don't run inference on the original audio file
        if audio_segment_path.split(os.sep)[-1] != audio_file_name.split(
                os.sep)[-1]:
            ds_process_audio(ds, audio_segment_path, file_handle)

    print("\nSRT file saved to", srt_file_name)
    file_handle.close()

    # Clean audio/ directory
    shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)
Example #30
class DeepSpeech():
    def __init__(self, model_path, scorer_path, result_json_path,
                 result_txt_path, candidate_transcripts=3, beam_width=None):

        # Path to the Speech-To-Text model
        self.MODEL_PATH = model_path
        # Path to the scorer language model
        self.SCORER_PATH = scorer_path
        # The number of candidate transcripts to request
        self.CANDIDATE_TRANSCRIPTS = candidate_transcripts

        self.result_json_path = result_json_path
        self.result_txt_path = result_txt_path

        self.beam_width = beam_width

        self._setup()

    def _setup(self):
        self.ds = Model(self.MODEL_PATH)  # Declare the model obj
        # Use the model's expected input sample rate (16 kHz for the
        # released English models)
        self.sample_rate = self.ds.sampleRate()

        if self.beam_width:
            self.ds.setBeamWidth(self.beam_width)

        if self.SCORER_PATH:
            self.ds.enableExternalScorer(self.SCORER_PATH)

    def convert_samplerate(self, audio_path, desired_sample_rate):
        sox_cmd = ('sox {} --type raw --bits 16 --channels 1 --rate {} '
                   '--encoding signed-integer --endian little '
                   '--compression 0.0 --no-dither - ').format(
                       quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(
                shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                'SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'
                          .format(desired_sample_rate, e.strerror))

        return desired_sample_rate, np.frombuffer(output, np.int16)

    def words_from_candidate_transcript(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i, token in enumerate(metadata.tokens):
            # Append character to word if it's not a space
            if token.text != " ":
                if len(word) == 0:
                    # Log the start time of the new word
                    word_start_time = token.start_time

                word = word + token.text
            # Word boundary is either a space or the last character in the arr
            if token.text == " " or i == len(metadata.tokens) - 1:
                word_duration = token.start_time - word_start_time

                if word_duration < 0:
                    word_duration = 0

                each_word = dict()
                each_word["word"] = word
                each_word["start_time "] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)

                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0

        return word_list
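
    # Illustrative (not actual) shape of the returned word_list for a clip
    # containing "hello world":
    #
    #   [{"word": "hello", "start_time": 0.0,  "duration": 0.42},
    #    {"word": "world", "start_time": 0.55, "duration": 0.38}]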

    def metadata_json_output(self, metadata):
        json_result = dict()
        json_result["transcripts"] = [{
            "confidence": transcript.confidence,
            "words": self.words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
        return json.dumps(json_result, indent=4)

    def take_audio_info(self):
        probe = ffmpeg.probe(self.FILE_PATH)
        self.audio_info = next(
            (stream for stream in probe['streams']
             if stream['codec_type'] == 'audio'), None)
        print(self.audio_info)
        return self.audio_info

    def take_audio(self):
        out, err = (
            ffmpeg
            .input(self.FILE_PATH)
            .output('-', format='s16le',
                    acodec='pcm_s16le', ac=1, ar=self.sample_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )
        self.audio = np.frombuffer(out, np.int16)
        return self.audio

    def speech2text(self):
        metadata = self.ds.sttWithMetadata(
            self.audio, self.CANDIDATE_TRANSCRIPTS)
        json_result = self.metadata_json_output(metadata)

        with open(self.result_json_path, 'w') as outfile:
            outfile.write(json_result)

        dict_result = json.loads(json_result)
        word_list = [item["word"]
                     for item in dict_result["transcripts"][0]["words"]]

        sentence = " ".join(word_list)
        self.export2textfile(sentence)
        return sentence

    def export2textfile(self, sentence):
        # Write the final transcript to the configured text file
        with open(self.result_txt_path, "w") as txt_file:
            txt_file.write(sentence)

    def set_file(self, filepath):
        self.FILE_PATH = filepath
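
# A hedged usage sketch (model, scorer, and media paths are illustrative):
#
#   stt = DeepSpeech(model_path="deepspeech-0.9.3-models.pbmm",
#                    scorer_path="deepspeech-0.9.3-models.scorer",
#                    result_json_path="result.json",
#                    result_txt_path="result.txt")
#   stt.set_file("interview.mp4")
#   stt.take_audio()          # decode to 16 kHz mono PCM via ffmpeg
#   print(stt.speech2text())  # writes JSON/TXT results, returns the text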