def __init__(self, username, password):
    """Create SpeechToText object.

    username -- username for Watson STT service
    password -- password for Watson STT service
    """
    self.user = username
    self.pas = password
    self.speech_to_text = SpeechToTextV1(username=username,
                                         password=password,
                                         x_watson_learning_opt_out=True)
def main(args):
    directory = args.videos_dir
    selected_cat = args.category
    speech_to_text = SpeechToTextV1(username=IBM_USERNAME, password=IBM_PASSWORD)

    # Read wave file names in videos directory
    audio_names = []
    for video_file in os.listdir(os.path.join(directory, selected_cat)):
        if video_file.endswith(".wav") and not os.path.isfile(
                os.path.join(directory, selected_cat, video_file[0:-4] + '.json')):
            audio_names.append(video_file)
    audio_names = natsorted(audio_names)
    num_files = len(audio_names)
    print('found', num_files, 'files')
    #print(audio_names)

    # Read spreadsheet
    df = pd.read_excel(os.path.join(directory, selected_cat + '.xlsx'))

    for audio_name in audio_names:
        # For each video file, check if the link is available
        data = df[df['Video'].str.contains(audio_name[:-4]) == True]
        link = ''
        if data.shape[0] == 0:
            print('Not found in spreadsheet:', audio_name)
        else:
            link = data.iloc[0]['Link']

        # Extract text using Watson
        print('Extracting detailed text using Watson for', audio_name)
        audio_path = os.path.join(directory, selected_cat, audio_name)
        with open(audio_path, "rb") as audio_file:
            result = speech_to_text.recognize(
                audio_file,
                content_type="audio/wav",
                model='es-ES_BroadbandModel',
                timestamps=True,
                word_confidence=True,
            ).get_result()

        # Add the link to the results
        result['link'] = link

        # Save json file
        out_json_path = audio_path[:-4] + '.json'
        with open(out_json_path, 'w') as outfile:
            json.dump(result, outfile)
def get_txt(filename):
    record(filename)
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******')
    with open(filename, 'rb') as audio_file:
        return (speech_to_text.recognize(audio_file,
                                         content_type='audio/wav',
                                         smart_formatting='true')['results'][0]
                ['alternatives'][0]['transcript'])
def sendToSTT():
    with open("sample.wav", "rb") as audio_file:
        stt = SpeechToTextV1(username=USER, password=PSWD)
        result = stt.recognize(audio=audio_file,
                               content_type=CONT_TYPE,
                               model=LANG)
    result_dict = result.get_result()
    text = ""
    for i in range(len(result_dict["results"])):
        text += result_dict["results"][i]["alternatives"][0]["transcript"] + '\n'
    return text
def stt(filename):
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False
    )
    with open(join(dirname(__file__), filename), 'rb') as audio_file:
        result = speech_to_text.recognize(audio_file,
                                          content_type='audio/wav',
                                          timestamps=False,
                                          word_confidence=False)
    return result
def __init__(self):
    self.swrap = SWRAP(threshold=4000, mode=SWRAP.STREAM)
    self.swrap.stream_start()
    self.get_key_and_pass()
    self.result = WatsonPrediction()
    self.watsonstt = SpeechToTextV1(
        username=self.username,
        password=self.password,
        x_watson_learning_opt_out=False
    )
    log.info("Watson activated")
    self.spin()
def recognize_speech_ibm_plus(yourkey, url_address, audio_input, csv_output):
    # Set your authorization
    speech_to_text = SpeechToTextV1(iam_apikey=yourkey, url=url_address)

    # Use the IBM API to recognize your audio
    files = audio_input
    with open(files, 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            model="en-US_NarrowbandModel",  # there are 8 models in the IBM API; check the website for the most suitable one
            content_type='audio/wav',       # the format of your audio; wav is a recommended one
            timestamps=True,                # set True to get the onset and offset of each word
        ).get_result()

    # The result is returned as JSON, so save it to a local file first
    with open("C:/Users/taotao/Desktop/research/test/data.json", "w") as write_file:
        json.dump(speech_recognition_results, write_file)
    data = r"C:/Users/taotao/Desktop/research/test/data.json"

    # Read and convert json to dictionary
    def js_r(data):
        with open(data, encoding='utf-8') as f_in:
            return json.load(f_in)

    my_dic_data = js_r(data)

    # Make sure you remember the audio you opened, and double check
    print(audio_input)

    # Pick the lists we need from the dictionary and combine them into one list
    dict_step1 = my_dic_data['results']
    words = []
    for n in dict_step1:
        sentence = n['alternatives'][0]['timestamps']
        words = words + sentence

    # Turn the list into a table and add a header
    list_headline = ['word', 'onset', 'offset']
    table = pd.DataFrame(columns=list_headline, data=words)
    table[['onset']] = table[['onset']] * 1000
    table[['offset']] = table[['offset']] * 1000

    # To fit this table with Datavyu, change the order of the columns
    table = table[['onset', 'offset', 'word']]
    print(table)

    # Store the table in a csv file
    table.to_csv(csv_output)
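# A minimal usage sketch for recognize_speech_ibm_plus above. The API key, service
# URL, and file paths here are placeholders for illustration only; substitute your
# own IAM credentials and local paths before running.
if __name__ == "__main__":
    recognize_speech_ibm_plus(
        yourkey="YOUR_IAM_APIKEY",
        url_address="https://stream.watsonplatform.net/speech-to-text/api",
        audio_input="C:/Users/taotao/Desktop/research/test/sample.wav",
        csv_output="C:/Users/taotao/Desktop/research/test/words.csv",
    )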
def _get_client(self, client_type):
    if client_type == "stt":
        return SpeechToTextV1(
            url=self.yml.get("speech_to_text").get("endpoint"),
            username=self.yml.get("speech_to_text").get("username"),
            password=self.yml.get("speech_to_text").get("password")
        )
    else:
        return ToneAnalyzerV3(
            version=self.yml.get("tone_analyzer").get("version"),
            username=self.yml.get("tone_analyzer").get("username"),
            password=self.yml.get("tone_analyzer").get("password")
        )
def sendToSTT():
    with open("sample.wav", "rb") as audio_file:
        stt = SpeechToTextV1(iam_apikey=KEY, url=URL)
        result = stt.recognize(audio=audio_file,
                               content_type=CONT_TYPE,
                               model=LANG)
    result_dict = result.get_result()
    text = ""
    print(result_dict)
    for i in range(len(result_dict["results"])):
        text += result_dict["results"][i]["alternatives"][0]["transcript"] + '\n'
    return text
def call_to_watson_speech_to_text(filepath):
    speech_to_text = SpeechToTextV1(username=WATSON_SPT_SERVICE_USERNAME,
                                    password=WATSON_SPT_SERVICE_PASSWORD,
                                    x_watson_learning_opt_out=False)
    models = speech_to_text.models()
    us_model = speech_to_text.get_model('en-US_BroadbandModel')
    with open(filepath, 'rb') as audio_file:
        results = speech_to_text.recognize(audio_file,
                                           content_type='audio/wav',
                                           timestamps=True,
                                           word_confidence=True,
                                           speaker_labels=True)
    return results
def __init__(self, key, url):
    """Call the super constructors and authenticate to the IBM SDK by creating an SDK STT interface.

    Args:
        url (str): The IBM API URL; found in the service credentials of the STT service.
        key (str): The IBM API key; found in the service credentials of the STT service.
    """
    ProcessService.__init__(self)
    RecognizeCallback.__init__(self)
    self.url = url
    self.key = key
    self.interface = SpeechToTextV1(iam_apikey=self.key, url=self.url)
def send_to_watson():
    with open("sample.wav", "rb") as audio_file:
        stt = SpeechToTextV1(username=USER, password=PSWD)
        result = stt.recognize(audio=audio_file,
                               content_type=CONT_TYPE,
                               model=LANG)
    text = ""
    result_dict = result.get_result()
    for i in range(len(result_dict['results'])):
        text += result_dict['results'][i]['alternatives'][0]['transcript']
    return text
def texttospeech(request):
    speech_to_text = SpeechToTextV1(
        iam_apikey='ivcz4sw1451NvNDgU_9Jfc9y4EqIpo4Qmy8iW4X8x-xX',
        url='https://stream.watsonplatform.net/speech-to-text/api')
    try:
        speech_recognition_results = speech_to_text.recognize(
            audio=request.FILES["audio"],
            content_type='audio/wav').get_result()
        data = speech_recognition_results
        text = data.get("results")[0].get("alternatives")[0].get("transcript")

        personality_insights = PersonalityInsightsV3(
            version='2018-09-20',
            iam_apikey='JoSYNcMGd-pWBUQV289Fv8gh0kFpH5_SDCENobZTruqA',
            url='https://gateway.watsonplatform.net/personality-insights/api')
        try:
            text_file = open("file.txt", "r")
            profile = personality_insights.profile(
                content=text_file.read(),
                accept='application/json',
                content_type='text/plain').get_result()

            results = {}
            personality = profile.get("personality")
            openness = personality[0].get("children")
            conscientiousness = personality[1].get("children")
            extraversion = personality[2].get("children")
            agreeableness = personality[3].get("children")
            neuroticism = personality[4].get("children")
            traits = [
                openness, conscientiousness, extraversion, agreeableness,
                neuroticism
            ]
            for trait in traits:
                for x in range(5):
                    name = trait[x].get("name")
                    percentile = trait[x].get("percentile")
                    results[name] = percentile
            return HttpResponse(json.dumps(results))
        except WatsonApiException as ex:
            print("Method failed: " + ex.message + ": " + str(ex.code))
            return render(request, 'vfl/ser1.html')
    except WatsonApiException as ex:
        print("Method failed with status code " + ex.message)
        return render(request, 'vfl/ser1.html')
def convert_audio_to_text(file):
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)
    with open(file, 'rb') as audio:
        output = speech_to_text.recognize(audio,
                                          content_type='audio/flac',
                                          timestamps=True,
                                          word_confidence=True)
    text = output['results'][0]['alternatives'][0]['transcript']
    return text
def speech_to_text(filename, model_id):
    # Create a client
    stt = SpeechToTextV1(iam_apikey=keys.speech_to_text_key)

    # Open the file for transcribing.
    # The SpeechToTextV1 recognize() method returns a DetailedResponse object:
    # https://cloud.ibm.com/apidocs/speech-to-text/speech-to-text?code=python#response-details
    # get_result() gives us the JSON of the transcription result.
    with open(filename, 'rb') as audio_file:
        response = stt.recognize(audio=audio_file,
                                 content_type='audio/wav',
                                 model=model_id).get_result()

    # Get the transcript of the first result's top alternative
    transcript = response['results'][0]['alternatives'][0]['transcript']

    # Return the transcript
    return transcript
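# A small usage sketch for the speech_to_text helper above; the audio path and model
# ID are assumed values for illustration. Any model listed in the IBM Speech to Text
# docs (for example 'en-US_BroadbandModel') can be passed as model_id.
if __name__ == '__main__':
    print(speech_to_text('sample.wav', 'en-US_BroadbandModel'))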
def receive_audio(speech_file):
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)
    with open(join(dirname(__file__), speech_file), 'rb') as audio_file:
        text = json.dumps(speech_to_text.recognize(audio_file,
                                                   content_type='audio/wav',
                                                   timestamps=True,
                                                   word_confidence=True),
                          indent=2)
    return text
def __init__(self):
    super(IBMWatsonUtility, self).__init__()
    self.TTS = TextToSpeechV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)
    self.STT = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)
    self.threshold = 500
    self.chunk_size = 1024
    self.format = pyaudio.paInt16
    self.rate = 44100
def __init__(self):
    self.MIN_ANSWER_LEN = 5
    self.MIN_CONFIDENCE = 0.60
    self.SMALL_TALK = ['I see.', 'Got it.', 'Ok', 'Interesting']
    self.POSITIVE_REMARK = [
        "Good.", "Excellent!", "Sounds great!", "That's awesome!",
        "Wonderful!"
    ]
    self.NEGATIVE_REMARK = [
        "I'm sad to hear that.", "That doesn't sound very good.",
        "I'm sad to hear that.", "ah",
        "Someone forgot to have their coffee today"
    ]
    self.questions = [
        'Tell me about yourself',
        'Tell me about a recent project that you worked on',
        'What are your greatest weaknesses?',
        'What did you dislike the most about your last job?',
        'If you were an animal, which one would you want to be?',
        'What are your hobbies?',
        'What is your greatest professional achievement?',
        'Why do you want to work here?',
        'What are your strengths?',
        'Where do you see yourself in five years?',
        'What type of work environment do you prefer?',
        "What's a time you disagreed with a decision that was made at work?",
        'Why was there a gap in your employment?',
        'Can you explain why you changed career paths?',
        'How do you deal with pressure or stressful situations?',
        'What would your first 30, 60, or 90 days look like in this role?',
        'What are your salary requirements?',
        'How many tennis balls can you fit into a limousine?',
        'Are you planning on having children?',
        'How many ping pong balls fit on a 737?',
        'Describe a difficult work situation / project and how you overcame it',
        'How are you different from the competition?',
        'Do you take work home with you?',
        'How do you view yourself? Whom do you compare yourself to?',
        'What motivates you',
        'What did you like most about your last job?',
        'What did you dislike most about your last job?',
        'Why should I take a risk on you?'
    ]
    self.text_to_speech = TextToSpeechV1(
        x_watson_learning_opt_out=True)  # Optional flag
    self.speech_to_text = SpeechToTextV1(x_watson_learning_opt_out=False)
    self.nlu = NLU(version='2017-02-27')
    self.TEMPFILE = './temp/output.wav'
    self.answers, self.sentiments = [], []
def get_sound_text(name):
    '''
    This function accepts one string name and returns the string that the audio
    file refers to.
    name can be any of the 8 values below:
    name_list = ['airplane','ball','book','helicopter','laptop','ocean','strawberry','train']
    '''
    import json
    from os.path import join, dirname
    from watson_developer_cloud import SpeechToTextV1

    name_list = [
        'airplane', 'ball', 'book', 'helicopter', 'laptop', 'ocean',
        'strawberry', 'train'
    ]
    if name not in name_list:
        return 'You gave "name" a wrong value, it is not in our list'

    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)

    with open(
            join(
                dirname(__file__),
                '/home/steve/Documents/Presentation/request_sound/' + name + '.mp3'),
            'rb') as audio_file:
        watson_result = json.dumps(speech_to_text.recognize(
            audio_file,
            content_type='audio/mp3',
            timestamps=False,
            word_confidence=True),
            indent=2)

    # Here we get Watson's result and we print it
    print('Watson\'s sound guessing result is: ')
    print(watson_result)

    # Below we slice the result and keep the string that the sound refers to
    resultstr = str(watson_result)
    resultlist = resultstr.splitlines()
    for i in resultlist:
        if "transcript" in i:
            termstr = i  # termstr contains '"transcript": "correct_word"' (it's our word)
            start_index = termstr.find(': "')
            rm_start_str = termstr[start_index + 3:]
            end_index = rm_start_str.find('"')
            # The return value is exactly the string that the sound refers to
            return rm_start_str[:end_index - 1]
def speechToText(filePath):
    modified_file_path = filePath[0:len(filePath) - 3] + 'txt'
    txt_file = open(modified_file_path, 'w')
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False
    )
    with open(join(dirname(__file__), filePath), 'rb') as audio_file:
        txt_file.write(json.dumps(speech_to_text.recognize(
            audio_file,
            content_type='audio/wav',
            timestamps=True,
            word_confidence=True),
            indent=2))
    txt_file.close()
    return modified_file_path
def calc_watson_STT(self, afile):
    speech_to_text = SpeechToTextV1(
        username="******",
        password="******",
        x_watson_learning_opt_out=False)
    print(
        json.dumps(speech_to_text.get_model('en-US_BroadbandModel'),
                   indent=2))
    with open(afile, 'rb') as audio_file:
        stt = speech_to_text.recognize(audio_file,
                                       content_type='audio/wav',
                                       timestamps=True,
                                       word_confidence=False)
    return stt
def TranscodeFromFile(path, sample_rate):
    try:
        with io.open(path, 'rb') as audio_file:
            # Watson connection
            stt = SpeechToTextV1(
                iam_apikey=model.key.WATSON_APIKEY,
                url=model.key.WATSON_URL)
            response = stt.recognize(
                audio=audio_file,
                content_type=cont_type,
                model=lang)
        result_json = response.result
        for i in range(len(result_json["results"])):
            logger.debug(
                result_json["results"][i]["alternatives"][0]["transcript"])
        return result_json["results"][0]["alternatives"][0]["transcript"]
    except Exception:
        return ""
def spe2tex(directory):
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)
    with open(directory, 'rb') as audio_file:
        return json.dumps(speech_to_text.recognize(
            audio_file,
            content_type='audio/mp3',
            timestamps=True,
            model='en-US_BroadbandModel',
            word_confidence=True),
            indent=2,
            encoding='UTF-8',
            ensure_ascii=False)
def audio2text(config, url):
    # Return the response from IBM for speech recognition
    speech_to_text = SpeechToTextV1(
        username=config['username'],
        password=config['password'],
        url='https://stream.watsonplatform.net/speech-to-text/api')

    # ibm.com/watson/developercloud/speech-to-text/api/v1/python.html?python#recognize-sessionless
    with open(get_path(url), 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,                  # audio file
            content_type='audio/wav',          # specify the audio type
            model='en-US_BroadbandModel',      # speech recognition model
            smart_formatting=False,            # convert dates, times, numbers, etc. into readable form
            timestamps=True,                   # return timestamps of each word
            max_alternatives=1)                # number of alternative transcripts
    return speech_recognition_results
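# A hedged usage sketch for audio2text above. The credentials in `config` and the
# 'sample.wav' name are placeholders, get_path() is assumed to resolve the name to a
# local file, and the older username/password SDK is assumed to return the
# recognition result as a plain dict, which is indexed here.
if __name__ == '__main__':
    config = {'username': '******', 'password': '******'}
    response = audio2text(config, 'sample.wav')
    print(response['results'][0]['alternatives'][0]['transcript'])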
def getTextFromFile(audio_file):
    speech_to_text = SpeechToTextV1(iam_apikey=api_key, url=url)
    speech_to_text.set_detailed_response(True)
    speech_recognition_results = speech_to_text.recognize(
        audio=audio_file,
        content_type='audio/wav',
        timestamps=True
    ).get_result()['results']
    result = speech_recognition_results[0]
    alternatives = result['alternatives']
    # Wrapping the raw dict in SpeechRecognitionAlternative stores it in the
    # .transcript attribute, so the actual text is read from that dict below.
    alternative = SpeechRecognitionAlternative(alternatives[0])
    transcript = alternative.transcript['transcript']
    return transcript
def __init__(self):
    self.CHUNK = 1024
    self.BUF_MAX_SIZE = self.CHUNK * 10
    self.q = Queue(maxsize=int(round(self.BUF_MAX_SIZE / self.CHUNK)))
    self.audio_source = AudioSource(self.q, True, True)
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 1
    self.RATE = 44100

    self.__apikey_stt = Config().Get("SpeechToText", "WatsonSTTAPIKey")
    self.__url_stt = Config().Get("SpeechToText", "WatsonSTTUrl")
    self.__apikey_tts = Config().Get("TextToSpeech", "WatsonTTSAPIKey")
    self.__url_tts = Config().Get("TextToSpeech", "WatsonTTSUrl")
    self.__voiceName = Config().Get("TextToSpeech", "WatsonVoiceName")
    self.__language_2letter_cc = Config().Get("SpeechToText", "CountryCode2Letter")
    self.__language_4letter_cc = Config().Get("SpeechToText", "CountryCode4Letter")
    self.__audioPlayer = Config().Get("TextToSpeech", "AudioPlayer") + " '{0}'"

    self.text_to_speech = TextToSpeechV1(url=self.__url_tts,
                                         iam_apikey=self.__apikey_tts)
    self.text_to_speech.set_default_headers(
        {'x-watson-learning-opt-out': "true"})
    self.speech_to_text = SpeechToTextV1(url=self.__url_stt,
                                         iam_apikey=self.__apikey_stt)
    self.speech_to_text.set_default_headers(
        {'x-watson-learning-opt-out': "true"})

    self.audio = pyaudio.PyAudio()
    # Open stream using callback
    self.stream = self.audio.open(format=self.FORMAT,
                                  channels=self.CHANNELS,
                                  rate=self.RATE,
                                  input=True,
                                  frames_per_buffer=self.CHUNK,
                                  stream_callback=self.pyaudio_callback,
                                  start=False)

    try:
        rospy.init_node('STT_watson_node', anonymous=True)
    except:
        FileLogger().Info('already initialized')
def speech_2_text(file_name):
    speech_to_text = SpeechToTextV1(username='',
                                    password='',
                                    x_watson_learning_opt_out=False)
    speech_to_text.get_model('en-US_BroadbandModel')
    with open(file_name, 'rb') as audio_file:
        results = speech_to_text.recognize(audio_file,
                                           content_type='audio/wav',
                                           timestamps=True,
                                           word_confidence=True)
    first_array = results["results"]
    transcript = ''
    for element in first_array:
        transcript += element["alternatives"][0]["transcript"] + ' '
    return transcript
def submit(request):
    info = request.POST['info']
    print("Submit worked")
    speech_to_text = SpeechToTextV1(
        username='******',
        password='******',
        x_watson_learning_opt_out=False)
    print(json.dumps(speech_to_text.models(), indent=2))
    print(
        json.dumps(speech_to_text.get_model('en-US_BroadbandModel'),
                   indent=2))
    text = transcribe_audio('../speech.wav')
    print(text)
    return render(request, "about.html", {"text": text})
def speech_to_text(path):
    speech_to_text = SpeechToTextV1(
        username="******",
        password="******",
        x_watson_learning_opt_out=False
    )
    with open(join(dirname(__file__), path), 'rb') as audio_file:
        json_text = speech_to_text.recognize(
            audio_file,
            content_type='audio/wav',
            timestamps=True,
            word_confidence=False)
    # Duration in seconds, taken from the end time of the last word timestamp
    sec = int(json_text['results'][0]['alternatives'][0]['timestamps'][-1][2])
    json_analysis = analyze(json_text['results'][0]['alternatives'][0]['transcript'], sec)
    print(json_analysis)
    # Send to server
    db['Text'].insert_one(json_text)
def text_json(out_f="out1.mp3", lang_k="ja-JP_BroadbandModel"):
    user = '******'
    pswd = 'password'  # placeholder
    ext = os.path.splitext(out_f)[1][1:]
    cont_type = "audio/" + ext
    print(cont_type)
    lang = lang_k

    # Send the audio to Watson and receive the recognition result
    stt = SpeechToTextV1(username=user, password=pswd)
    with open(out_f, "rb") as audio_file:
        result_json = stt.recognize(audio=audio_file,
                                    content_type=cont_type,
                                    model=lang).get_result()

    # Save the result to a file
    with open("result.json", "w") as f:
        json.dump(result_json, f, ensure_ascii=False, indent=2)