def get_text_from_audio():
    try:
        file = request.files.get('file')
        if file and '.' in file.filename and \
                file.filename.rsplit('.', 1)[1].lower() in ('flac', 'wav'):
            speech_to_text = SpeechToText()
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            text = speech_to_text.recognize(
                audio_file=os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return jsonify({'text': text})
        return jsonify(
            {'message': 'File format is not supported. Please use flac, wav'}), 400
    except Exception:
        return jsonify('Failed to upload file'), 400
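# For context, a minimal sketch of the Flask scaffolding the handler above
# assumes; the app object, the UPLOAD_FOLDER value, and the route path are
# assumptions for illustration, not taken from the original snippet.
import os
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'  # hypothetical location
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# The handler could then be registered as, e.g.:
# app.route('/stt', methods=['POST'])(get_text_from_audio)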
def PredictForLanguage(executor, language):
    language_path = os.path.join(path, language)
    if not app_utils.CheckIfPathExists(language_path):
        return
    files = app_utils.listdir_fullpath(language_path)
    with open(os.path.join(out_path, "vortex_" + language + ".csv"), "w") as f:
        for file in files:
            # Convert speech to text
            print("Converting Speech to Text for ", file)
            text = SpeechToText(executor, file, language)
            app_utils.DebugCommand("Text Recognised is ", text)
            # Translate to English before prediction
            translated = TranslateToEnglish(text, language, translator="Seq2Seq")
            app_utils.DebugCommand("Translated to English Is ", translated)
            # Wait for the model warm-up thread before predicting
            if ModelColdStarter.is_alive():
                ModelColdStarter.join()
            prediction = PredictResults(translated)
            print(prediction)
            f.write(file + ", " + prediction[0] + "\n")
def send_voice(self):
    try:
        result = SpeechToText().run(self.auth_token)
        output = result[0]
        self.auth_token = result[1]
        print(self.auth_token)
        self.browser.append("You: " + output)
        print(output)
        self.receive_message(output)
    except Exception as e:
        print(e)
def predict(speech_recognition=False, speech_synthesis=False):
    ''' Interact with a trained seq2seq model.
    1. speech_recognition - enable speech recognition from the microphone via PocketSphinx
    2. speech_synthesis - enable speaking the answers aloud via RHVoice
    '''
    name_dataset = configure_file_names()
    ttt = TextToText(f_name_w2v_model=f_name_w2v_model,
                     f_name_model=f_name_model,
                     f_name_model_weights=f_name_model_weights)

    if speech_recognition:
        print('[i] Loading the language model for speech recognition...')
        stt = SpeechToText('from_microphone', name_dataset)

    if speech_synthesis:
        print('[i] Loading the speech synthesizer...')
        tts = TextToSpeech('anna')

    print()
    question = ''
    while True:
        if speech_recognition:
            print('Listening...')
            question = stt.get()
            os.write(sys.stdout.fileno(), curses.tigetstr('cuu1'))
            print('You: ' + question)
        else:
            question = input('You: ')
        answer, lost_words = ttt.predict(question, True)
        print('\t=> %s' % answer)
        if len(lost_words) > 0:
            print('[w] Lost words: ' + ', '.join(lost_words) + '\n')
        else:
            print()
        if speech_synthesis:
            tts.get(answer)
def main():
    (stt_json, adj_json, output_json) = sys.argv[1:4]

    # Turn adjustment data into a list of kept segments
    with open(adj_json, 'r') as file:
        adj_data = json.load(file)

    # Turn the STT json into objects
    with open(stt_json, 'r') as file:
        stt = SpeechToText().from_json(json.load(file))

    # List of adjustments (start, end, adjustment)
    offset_adj = []
    # Last ending position for iterating through kept segments
    last_end = 0.00
    # Running tally of removed segment lengths
    current_adj = 0.00

    # For each segment that was kept, keep track of the gaps to know how much to adjust
    for kept_segment in adj_data:
        print(kept_segment + ":" + str(adj_data[kept_segment]))
        start = float(kept_segment)
        end = adj_data[kept_segment]
        # If the start of this segment is after the last end, we have a gap
        if start >= last_end:
            # Keep track of the gap in segments
            current_adj = current_adj + (start - last_end)
            # Add it to the list of adjustments
            offset_adj.append(
                Adjustment(start - current_adj, end - current_adj, current_adj))
        # Keep track of the last segment end
        last_end = end

    # For each word, find the corresponding adjustment
    for word in stt.results.words:
        adjust_word(word, offset_adj)

    # Write the resulting json
    mgm_utils.write_json_file(stt, output_json)
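# The script above relies on an Adjustment record and an adjust_word helper
# that are not shown. A minimal sketch of what they could look like, inferred
# from how they are called; the field names and the add-the-offset-back logic
# are assumptions, not the original implementation.
from dataclasses import dataclass

@dataclass
class Adjustment:
    start: float   # start of the kept segment on the edited timeline
    end: float     # end of the kept segment on the edited timeline
    offset: float  # total removed time preceding this segment

def adjust_word(word, offset_adj):
    # Shift a word's times from the edited timeline back toward the original
    # one by adding the accumulated offset of the segment that contains it
    for adj in offset_adj:
        if adj.start <= word.start <= adj.end:
            word.start += adj.offset
            word.end += adj.offset
            return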
def convert(media_file, kaldi_file, kaldi_transcript_file, output_json_file):
    mgm_utils.exception_if_file_not_exist(kaldi_file)
    if not os.path.exists(kaldi_transcript_file):
        raise Exception(
            "Exception: File " + kaldi_transcript_file +
            " doesn't exist, the previous command generating it must have failed.")

    results = SpeechToTextResult()

    # Open the kaldi json
    with open(kaldi_file) as json_file:
        data = json.load(json_file)

    # Get the kaldi transcript
    with open(kaldi_transcript_file, "r") as transcript:
        results.transcript = transcript.read()

    # Get a list of words
    words = data["words"]
    duration = 0.00

    # For each word, add a word to our results
    for w in words:
        time = float(w["time"])
        end = time + float(w["duration"])
        # Keep track of the last end time and use it as the duration
        if end > duration:
            duration = end
        results.addWord("", time, end, w["word"], None, None)

    # Create the media object
    media = SpeechToTextMedia(duration, media_file)

    # Create the final object
    outputFile = SpeechToText(media, results)

    # Write the output
    mgm_utils.write_json_file(outputFile, output_json_file)
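# For reference, the Kaldi json this converter expects looks roughly like the
# following; an illustrative example inferred from the fields the code reads,
# not an authoritative Kaldi schema.
example_kaldi_data = {
    "words": [
        {"word": "hello", "time": "0.00", "duration": "0.42"},
        {"word": "world", "time": "0.50", "duration": "0.38"},
    ]
}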
class Application:
    """ Application main class """
    def __init__(self, config):
        """
        Initialize
        :param config: configuration
        :type config: Config
        """
        self.command_processor = CommandProcessor(
            self._command_handlers(config.command_handlers))
        self.robot = Robot(config.apiai.client_access_token,
                           config.apiai.language,
                           self.command_processor.commands)
        self.speech_to_text = SpeechToText(
            config.speechkit.key, "", config.speechkit.recognition.language)
        self.text_to_speech = TextToSpeech(
            config.speechkit.synthesis.cache_size, config.speechkit.key,
            config.speechkit.synthesis.language,
            config.speechkit.synthesis.speaker,
            config.speechkit.synthesis.emotion,
            config.speechkit.synthesis.speed)
        self.record = SpeechCapture(config.record.silence_calculation_chunks,
                                    config.record.speech_level_coefficient,
                                    config.record.start_wait_chunks,
                                    config.record.finish_wait_chunks)

    def _handler(self, real_handler):
        return lambda args: real_handler(self, args)

    def _command_handlers(self, command_handlers):
        result = {}
        for command, handler in stdhandlers.items():
            result[command] = self._handler(handler)
        for command, handler in command_handlers.items():
            result[command] = self._handler(handler)
        return result

    def _process_answer(self, commands):
        finish, results = self.command_processor.process_commands(commands)
        if not finish:
            noempty_results = [
                result for result in results if result is not None
            ]
            play_list(noempty_results)

    def welcome(self):
        """ Run robot welcome event """
        success, session_id, answer_commands = self.robot.welcome()
        if success:
            self.speech_to_text.uuid = session_id
            self._process_answer(answer_commands)

    def query(self):
        """ Run speech request (if we have one) """
        print("Listening")
        silent, record = self.record.record_mp3()
        if silent:
            print("Silent")
            return
        success, text = self.speech_to_text.convert(record)
        if not success:
            print("Not recognized")
            return
        print("User : " + text)
        success, commands = self.robot.query(text)
        print("Robot commands : " + json.dumps(commands))
        if not success:
            return
        self._process_answer(commands)

    def main(self):
        self.welcome()
        while True:
            self.query()
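# A minimal entry-point sketch for the Application class above; how Config is
# constructed is an assumption, as the original snippet does not show it.
# if __name__ == "__main__":
#     config = Config("config.yml")  # hypothetical loader
#     Application(config).main()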
class Andrew(object):
    """The rap voice assistant """
    def __init__(self,
                 detect_model="data/andrew2.net",
                 lyrics_model="data/keras_model_1200.h5",
                 lyrics_chars="data/chars.pkl"):
        # microphone
        self.mic = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
        # wake word detector
        self.detector = TriggerDetector(detect_model)
        # speech and language services
        self.speech_client = SpeechToText()
        self.luis = LangUnderstand()
        self.tts = TextToSpeech()
        # lyrics generator model
        self.lyrics_gen = LyricsGenerator(lyrics_model, lyrics_chars)
        self.pred_queue = DetectQueue(maxlen=5)
        self.is_wakeup = False
        # pytft display
        self.tft = TFTDisplay()
        self.tft_queue = queue.Queue()
        self.tft_thread = threading.Thread(target=self.tft_manage, args=())
        self.tft_thread.daemon = True
        self.tft_thread.start()
        self.notify("hi_there")

    def notify(self, topic="hi_there", is_async=False, audio_path="data/audio"):
        # Notify with local preset audio files
        from os.path import join, isfile
        audio_file = join(audio_path, f"{topic}.wav")
        if not isfile(audio_file):
            return
        self.tts.play_file(audio_file, is_async)

    def generate_rap(self, topic="", beat_path="data/beat"):
        """Generate rap and play """
        tts = self.tts
        lyrics_gen = self.lyrics_gen
        response = tts.generate_speech(f"hey, I can rap about {topic}")
        tts.play(response, True)
        # Generate based on topic
        lyrics_output = lyrics_gen.generate(topic)
        # Generate speech
        lyrics_speech = tts.generate_speech(lyrics_output)
        # Select beat
        beat_index = random.randint(0, 20)
        # Play beat and lyrics
        tts.play_file(f'{beat_path}/beat_{beat_index}.wav', True)
        tts.play(lyrics_speech)

    def get_weather_message(self, city="Ithaca"):
        import requests, json, os
        api_key = os.getenv('WEATHER_APIKEY')
        base_url = "https://api.openweathermap.org/data/2.5/weather?"
        city_name = f"{city},us"
        complete_url = f"{base_url}q={city_name}&units=imperial&APPID={api_key}"
        try:
            response = requests.get(complete_url)
            res = response.json()
            msg_weather = f"Today, it's {res['weather'][0]['description']} in {city}. "
            msg_temp = f"The temperature is {int(res['main']['temp'])} degrees."
            return msg_weather + msg_temp
        except (requests.RequestException, KeyError, ValueError):
            pass
        return ""

    def intent_recognize(self, text=""):
        """Recognize intent """
        luis = self.luis
        tts = self.tts
        # Get result from language understanding engine
        luis_result = luis.predict(text)
        intent = luis_result.top_scoring_intent.intent
        if intent == "Freestyle":
            entities = luis_result.entities
            entity_topic = "rap"
            if len(entities) > 0:
                entity = entities[0]
                cprint(f'The topic is {entity.entity}', 'cyan')
                entity_topic = entity.entity
            self.generate_rap(entity_topic)
        elif intent == "Weather":
            response = tts.generate_speech("I will tell you the weather in Ithaca.")
            tts.play(response)
            weather = self.get_weather_message()
            response = tts.generate_speech(weather)
            tts.play(response)
        else:
            self.notify("sorry")

    def tft_manage(self):
        """Manage TFT display through state """
        self.tft.display_text("Andrew is waking up")
        status = {'state': 'None'}
        while True:
            if status['state'] == 'wait':
                self.tft.display_wave()
            elif status['state'] == 'listen':
                self.tft.display_wave((0, 255, 0))
            # Update the status
            try:
                update = self.tft_queue.get(block=False)
                if update is not None:
                    status = update
            except queue.Empty:
                continue

    def start(self):
        """Start listening and interacting """
        tft = self.tft
        tts = self.tts
        # Init stream
        with self.mic as stream:
            self.tft_queue.put({'state': 'listen'})
            while True:
                if not self.is_wakeup:
                    stream.closed = False
                    while not stream.closed:
                        stream.audio_input = []
                        audio_gen = stream.generator()
                        for chunk in audio_gen:
                            if not self.is_wakeup:
                                prob = self.detector.get_prediction(chunk)
                                self.pred_queue.append(prob > 0.6)
                                print('!' if prob > 0.6 else '.', end='', flush=True)
                                if self.pred_queue.count >= 2:
                                    self.notify("hi")
                                    cprint(' Trigger word detected! \n', 'magenta')
                                    self.pred_queue.clear()
                                    self.is_wakeup = True
                                    stream.pause()
                                    break
                else:
                    cprint('Speech to text\n', 'green')
                    time.sleep(1)
                    stream.closed = False
                    try:
                        voice_command = self.speech_client.recognize(stream)
                        cprint(f'{voice_command}\n', 'yellow')
                        cprint('Recognition ended...\n', 'red')
                        stream.pause()
                        #tft.display_text(f'"{voice_command}"')
                        if "goodbye" in voice_command:
                            self.notify("see_you")
                            exit()
                        if "sorry" in voice_command:
                            self.notify("its_ok")
                        else:
                            cprint('Recognize intents...', 'cyan')
                            self.intent_recognize(voice_command)
                    except Exception as e:
                        cprint(f'Error: {e}', 'red')
                    self.is_wakeup = False
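# A minimal sketch of how the assistant above could be launched; the
# entry-point guard is an assumption, as the original snippet does not show it.
if __name__ == "__main__":
    andrew = Andrew()  # plays the "hi_there" greeting on startup
    andrew.start()     # blocks, listening for the wake word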
def main():
    (media_file, transcribe_file, output_stt_json_file,
     output_seg_json_file) = sys.argv[1:5]
    mgm_utils.exception_if_file_not_exist(transcribe_file)

    # Open the transcribe output
    with open(transcribe_file) as json_file:
        data = json.load(json_file)

    amp_results = SpeechToTextResult()

    # Fail if we don't have results
    if "results" not in data.keys():
        exit(1)

    aws_results = data["results"]
    if "transcripts" not in aws_results.keys():
        exit(1)

    # Parse transcript
    transcripts = aws_results["transcripts"]
    for t in transcripts:
        amp_results.transcript = amp_results.transcript + t["transcript"]

    # Fail if we don't have any items
    if "items" not in aws_results.keys():
        exit(1)

    # Parse items (words)
    items = aws_results["items"]
    duration = 0.00

    # For each item, get the necessary parts and store as a word
    for i in items:
        alternatives = i["alternatives"]
        # Choose an alternative
        max_confidence = 0.00
        text = ""
        # Each word is stored as an "alternative". Get the one with the maximum confidence
        for a in alternatives:
            if float(a["confidence"]) >= max_confidence:
                max_confidence = float(a["confidence"])
                text = a["content"]
        end_time = -1
        start_time = -1
        # Two types (punctuation, pronunciation). Only keep times for pronunciation
        if i["type"] == "pronunciation":
            end_time = float(i["end_time"])
            start_time = float(i["start_time"])
            # If this is the greatest end time, store it as the duration
            if end_time > duration:
                duration = end_time
        # Add the word to the results
        amp_results.addWord(i["type"], start_time, end_time, text,
                            "confidence", max_confidence)

    # Create the media object
    media = SpeechToTextMedia(duration, media_file)

    # Create the final object
    outputFile = SpeechToText(media, amp_results)

    # Write the output
    mgm_utils.write_json_file(outputFile, output_stt_json_file)

    # Start the segmentation schema with diarization data
    # Create a segmentation object to serialize
    seg_schema = Segmentation()

    # Create the media object
    segMedia = SegmentationMedia(duration, media_file)
    seg_schema.media = segMedia

    if "speaker_labels" in aws_results.keys():
        speakerLabels = aws_results["speaker_labels"]
        seg_schema.numSpeakers = speakerLabels["speakers"]

        # For each segment, get the start time, end time and speaker label
        segments = speakerLabels["segments"]
        for segment in segments:
            seg_schema.addDiarizationSegment(float(segment["start_time"]),
                                             float(segment["end_time"]),
                                             segment["speaker_label"])

    # Write the output
    mgm_utils.write_json_file(seg_schema, output_seg_json_file)
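# For reference, an abbreviated sketch of the AWS Transcribe output the parser
# above walks; illustrative values only, see the AWS Transcribe documentation
# for the full schema.
example_aws_transcribe_output = {
    "results": {
        "transcripts": [{"transcript": "hello world"}],
        "items": [
            {
                "type": "pronunciation",
                "start_time": "0.00",
                "end_time": "0.42",
                "alternatives": [{"confidence": "0.99", "content": "hello"}],
            },
            {
                "type": "punctuation",
                "alternatives": [{"confidence": "0.0", "content": "."}],
            },
        ],
        "speaker_labels": {
            "speakers": 1,
            "segments": [
                {"start_time": "0.00", "end_time": "0.42",
                 "speaker_label": "spk_0"}
            ],
        },
    }
}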
def main():
    (input_file, json_file, bucketName, dataAccessRoleArn) = sys.argv[1:5]

    # Read a list of categories to ignore when outputting the entity list
    ignore_cats_list = list()
    if len(sys.argv) > 5:
        print("ignore cats:" + sys.argv[5])
        ignore_cats_list = split_ignore_list(sys.argv[5])

    # Variable declarations
    outputS3Uri = 's3://' + bucketName + '/'
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    jobName = 'AwsComprehend-' + timestamp + ".json"
    inputS3Uri = outputS3Uri + jobName

    # Get the transcript text from the input file
    with open(input_file, 'r') as file:
        stt = SpeechToText().from_json(json.load(file))

    # Create the ner object
    ner = EntityExtraction()

    # Add the media information
    if stt is None or stt.results is None:
        mediaLength = 0
    else:
        mediaLength = len(stt.results.transcript)

    # If we have a blank file, don't error. Create another blank json file to
    # pass to the next process
    if mediaLength == 0:
        ner.media = EntityExtractionMedia(mediaLength, input_file)
        mgm_utils.write_json_file(ner, json_file)
        exit(0)

    # Create a temp file to upload to S3
    tmpfile = create_temp_transcript_file(jobName, stt.results.transcript)

    # Copy the temporary text file to S3
    copy_to_s3(tmpfile.name, bucketName, jobName)

    # Make the call to AWS Comprehend
    output_uri = run_comprehend_job(jobName, inputS3Uri, outputS3Uri,
                                    dataAccessRoleArn)

    uncompressed_file = download_from_s3(output_uri, outputS3Uri, bucketName)
    if uncompressed_file is None:
        exit(1)

    comprehend_data = read_comprehend_response(uncompressed_file)
    ner.media = EntityExtractionMedia(mediaLength, input_file)

    # Variables for filling time offsets based on speech to text
    lastPos = 0  # Iterator to keep track of the location in the STT words
    sttWords = len(stt.results.words)  # Number of STT words

    if 'Entities' in comprehend_data.keys():
        for entity in comprehend_data["Entities"]:
            entity_type = entity["Type"]
            # Start and end time offsets
            start = None
            end = None
            text = entity["Text"]
            # Split the entity into an array of words based on whitespace
            entityParts = text.split()
            # For each word in the entity, find the corresponding word in the STT word list
            foundWordPos = None
            for entityPart in entityParts:
                for wordPos in range(lastPos, sttWords):
                    word = stt.results.words[wordPos]
                    # If it matches, set the time offset
                    if clean_entity_word(word.text) == clean_entity_word(entityPart):
                        # Keep track of the last position to save iterations
                        foundWordPos = wordPos
                        # Set start if we haven't set it yet
                        if start is None:
                            start = word.start
                        end = word.end
                        break
                else:
                    # This entity word was not found in the STT list; reset
                    start = None
                    end = None
                    foundWordPos = None
            if start is not None:
                lastPos = foundWordPos
            else:
                print("Could not find word")
                print(text)
                print(entityParts)
                print(lastPos)
            if clean_text(entity_type) not in ignore_cats_list and start is not None:
                # AMP-636 removed startOffset=endOffset=end=None
                ner.addEntity(entity_type, text, None, None, "relevance",
                              float(entity["Score"]), start, None)

    # Write the json file
    mgm_utils.write_json_file(ner, json_file)

    # Clean up temp files
    safe_delete(uncompressed_file)
    safe_delete(tmpfile.name)
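# clean_entity_word and split_ignore_list are called above but not shown.
# A plausible sketch under the assumption that they normalize case and strip
# punctuation; these are illustrations, not the original implementations.
import string

def clean_entity_word(word):
    # Normalize a word for comparison: strip surrounding punctuation, lowercase
    return word.strip(string.punctuation).lower()

def split_ignore_list(arg):
    # Turn a comma-separated CLI argument into a list of category names
    return [part.strip().lower() for part in arg.split(',')]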
def main():
    (root_dir, from_draftjs, original_transcript, to_transcript) = sys.argv[1:5]

    # Use the output instead of the input filename, as the latter is unique
    # while the former could be used by multiple jobs
    logger = MgmLogger(root_dir, "hmgm_transcript", to_transcript)
    sys.stdout = logger
    sys.stderr = logger

    try:
        # If from_draftjs is in error, raise an exception to notify the HMGM job
        # runner to fail the job; otherwise, if from_draftjs doesn't exist yet,
        # exit 1 to keep waiting
        mgm_utils.exit_if_file_not_ready(from_draftjs)
        print("Converting DraftJs " + from_draftjs + " to Transcript " + to_transcript)

        with open(from_draftjs) as json_file:
            d = json.load(json_file)
            data = eval(json.dumps(d))

        # Read the original file for extracting only the confidence score of each word
        original_input = open(original_transcript)
        original_json = json.loads(original_input.read())
        original_items = original_json["results"]["words"]

        results = SpeechToTextResult()
        word_type = text = ''
        confidence = start_time = end_time = -1
        duration = 0.0

        # The draftJS input file here always comes from a converted and corrected
        # AMP Transcript, so it should always contain 'entityMap'; otherwise an
        # error should occur
        # Standardising draft js format
        # if "entityMap" in data.keys():
        transcript = ''
        entityMap = data["entityMap"]
        for i in range(0, len(entityMap.keys())):
            punctuation = ''
            if str(i) not in entityMap.keys():
                continue
            entity = entityMap[str(i)]
            if "data" in entity:
                if "text" in entity["data"].keys():
                    text = entity["data"]["text"]
                    transcript += entity["data"]["text"] + " "
                    if text[-1] in string.punctuation:  # [',','.','!','?']
                        punctuation = text[-1]
                        text = text[0:-1]
            if "type" in entity:
                entity_type = entity["type"]
                if entity_type == "WORD":
                    word_type = "pronunciation"
                    if "start" in entity["data"]:
                        start_time = float(entity["data"]["start"])
                    if "end" in entity["data"]:
                        end_time = float(entity["data"]["end"])
                        if end_time > duration:
                            duration = end_time
                else:
                    word_type = entity_type
            results.addWord(word_type, start_time, end_time, text,
                            "confidence", confidence)
            if len(punctuation) > 0:
                results.addWord('punctuation', None, None, punctuation,
                                "confidence", 0.0)
        results.transcript = transcript
        words = results.words

        # Now retrieve the confidence values from the original input file and
        # assign them to 'results'
        list_items = []
        list_result = []
        for i in range(0, len(original_items)):
            list_items.append(original_items[i]["text"])
        for j in range(0, len(words)):
            list_result.append(words[j].text)
        d = difflib.Differ()
        res = list(d.compare(list_items, list_result))
        i = j = 0
        word_count = len(words)
        original_item_count = len(original_items)
        print("original item count: " + str(original_item_count))
        print("word count: " + str(word_count))
        for ele in res:
            if j >= word_count or i >= original_item_count:
                break
            elif ele.startswith("- "):
                i += 1
            elif len(ele) > 2 and ele[0:2] == "+ ":
                words[j].score.scoreValue = 1.0
                j += 1
            elif ele[0:1] == " " and words[j].text == original_items[i]["text"]:
                if "score" in original_items[i]:
                    words[j].score.scoreValue = float(
                        original_items[i]["score"]["scoreValue"])
                else:
                    # default the score to 1.0 if it didn't exist originally
                    words[j].score.scoreValue = 1.0
                i += 1
                j += 1
        print("i: " + str(i) + " j:" + str(j))

        # Create the media object
        media = SpeechToTextMedia(duration, original_transcript)

        # Create the final object
        stt = SpeechToText(media, results)

        # Write the output
        mgm_utils.write_json_file(stt, to_transcript)
        print("Successfully converted from DraftJs " + from_draftjs +
              " to Transcript " + to_transcript)
        # As the last command in HMGM, implicitly exit 0 here to let the whole
        # job complete in success
    except Exception as e:
        # As the last command in HMGM, exit -1 to let the whole job fail
        print("Failed to convert from DraftJs " + from_draftjs +
              " to Transcript " + to_transcript, e)
        traceback.print_exc()
        sys.stdout.flush()
        exit(-1)
def reply(text_message, client, thread_id):
    if client.uid == thread_id:
        return
    mess = Message(text=text_message)
    client.send(mess, thread_id=thread_id, thread_type=ThreadType.USER)
    client.delay()


client = Bot('',
             '',
             max_tries=1,
             user_agent=cfg.user_agent,
             session_cookies=cfg.session_cookies)
client.startListening()

acc_demo = '100041985261746'
acc_trDuong = '100014187060145'

recg = SpeechToText('right shift', reply, (
    client,
    acc_trDuong,
))

while 1:
    client.doOneListen()
    recg.recognize_once()

while 1:
    pass
#speech_to_text.recognize('space', reply, (client, '100014187060145',))
def run(host, port, wsgi=False, https_mode=False):
    ''' Auto-select an available port (if port 0 is given), load the language
    model and the neural network, and start the server.
    1. wsgi - True: start a WSGI server, False: start the test Flask server
    2. https_mode - True: run over https (the certificate and key must be in
    cert.pem and key.pem), False: run over http
    A self-signed certificate can be obtained with:
    openssl req -x509 -newkey rsa:4096 -nodes -out temp/cert.pem -keyout temp/key.pem -days 365 '''
    if port == 0:  # If port 0 was given, auto-select any available port
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.bind((host, 0))
            port = sock.getsockname()[1]
            log('selected port ' + str(port))
            sock.close()
        except socket.gaierror:
            log('address ' + host + ':' + str(port) + ' is invalid', level='error')
            sock.close()
            return
        except OSError:
            log('address ' + host + ':' + str(port) + ' is unavailable', level='error')
            sock.close()
            return

    log('Flask v.' + flask_version + ', WSGIServer v.' + wsgi_version)
    log('maximum accepted payload size set to: {:.2f} KB'.format(
        max_content_length / 1024))

    name_dataset = f_name_w2v_model_plays[
        f_name_w2v_model_plays.rfind('w2v_model_') +
        len('w2v_model_'):f_name_w2v_model_plays.rfind('.bin')]

    log('loading the seq2seq model trained on the ' + name_dataset + ' dataset...')
    global ttt
    print()
    ttt = TextToText(f_name_w2v_model=f_name_w2v_model_plays,
                     f_name_model=f_name_model_plays,
                     f_name_model_weights=f_name_model_weights_plays)
    print()

    log('loading the language model for speech recognition...')
    global stt
    stt = SpeechToText('from_file', name_dataset)

    log('loading the speech synthesizer...')
    global tts
    tts = TextToSpeech('anna')

    if wsgi:
        global http_server
        if https_mode:
            log('WSGI server started at https://' + host + ':' + str(port) +
                ' (press Ctrl+C or Ctrl+Z to exit)')
        else:
            log('WSGI server started at http://' + host + ':' + str(port) +
                ' (press Ctrl+C or Ctrl+Z to exit)')
        try:
            if https_mode:
                http_server = WSGIServer((host, port),
                                         app,
                                         log=app.logger,
                                         error_log=app.logger,
                                         keyfile='temp/key.pem',
                                         certfile='temp/cert.pem')
            else:
                http_server = WSGIServer((host, port),
                                         app,
                                         log=app.logger,
                                         error_log=app.logger)
            http_server.serve_forever()
        except OSError:
            print()
            log('address ' + host + ':' + str(port) + ' is unavailable', level='error')
    else:
        log('starting the test Flask server...')
        try:
            if https_mode:
                app.run(host=host,
                        port=port,
                        ssl_context=('temp/cert.pem', 'temp/key.pem'),
                        threaded=True,
                        debug=False)
            else:
                app.run(host=host, port=port, threaded=True, debug=False)
        except OSError:
            print()
            log('address ' + host + ':' + str(port) + ' is unavailable', level='error')
def main():
    fail_safe = FailSafe()
    coref_solver = CorefSolver()
    print("Press H for help.")

    verbose = False
    if "--verbose" in sys.argv:
        verbose = True
        u = UtteranceBranching(coref_solver, verbose=True)
    else:
        u = UtteranceBranching(coref_solver)

    kb_file_name = None
    if "--kb" in sys.argv:
        pos = sys.argv.index("--kb")
        kb_file_name = sys.argv[pos + 1]

    q_file_name = None
    if "--q" in sys.argv:
        pos = sys.argv.index("--q")
        q_file_name = sys.argv[pos + 1]

    if kb_file_name:
        with open(kb_file_name, "r") as f:
            utterances = list(f)
        for utterance in utterances:
            print("-------------------------------------------------------")
            print(utterance)
            response = u.process(utterance[:-1])
            print(response)

    if q_file_name:
        with open(q_file_name, "r") as f:
            questions = list(f)
        for q in questions:
            print("-------------------------------------------------------")
            print(q)
            response = u.process(q[:-1])
            if response:
                if response[-1] == "+":
                    question, fail_response, similarity = fail_safe.answer_questions(q[:-1])
                    coref_solver.prev.pop()
                    # print("*********", similarity)
                    if similarity > 0.7:
                        response = fail_response
                    else:
                        response = response[:-1]
            else:
                question, response, similarity = fail_safe.answer_questions(q[:-1])
                # print(similarity)
                coref_solver.prev.pop()
            print("Bot: ", response)
        return

    speech_to_text = SpeechToText()
    while True:
        # type or say
        print("You: ", end='', flush=True)
        utterance = str(sys.stdin.readline())
        if utterance[:-1].lower() == "h":
            print("Press H for help.")
            print("Press S to view assistant's internal state.")
            print("Press V in order to interact with the assistant with your voice.")
            continue
        if utterance[:-1].lower() == "s":
            u.internal_state()
            continue
        if utterance[:-1].lower() == "v":
            utterance = speech_to_text.process()
            print(utterance)
        else:
            utterance = utterance[:-1]

        response = u.process(utterance)
        if response:
            if response[-1] == "+":
                question, fail_response, similarity = fail_safe.answer_questions(utterance)
                coref_solver.prev.pop()
                if similarity > 0.7:
                    response = fail_response
                else:
                    response = response[:-1]
            tts = gTTS(text=response, lang='en')
            tts.save("response.mp3")
            os.system("mpg123 response.mp3 2> /dev/null")
            print("Bot: ", response)
            if response in ["Glad we talked!", "Happy to help!", "Gooodbye!"]:
                break
        else:
            question, response, similarity = fail_safe.answer_questions(utterance)
            # print(similarity)
            coref_solver.prev.pop()
            print("Bot: ", response)
            tts = gTTS(text=response, lang='en')
            tts.save("response.mp3")
            os.system("mpg123 response.mp3 2> /dev/null")
        print()