def transcribe_wav(in_fname):
    """generates transcription of a given wav file

    args:
        in_fname: file name of the wav file that should be transcribed

    returns:
        transcription of the wav file (stdout of pocketsphinx_continuous,
        decoded as utf-8, with all line breaks removed)

    raises:
        subprocess.CalledProcessError: praat or pocketsphinx_continuous did
            not return with code 0
    """
    tmp_fname1 = get_unique_fname('../tmp/extended', '.wav')
    tmp_fname2 = get_unique_fname('../tmp/transcribe', '.log')
    try:
        # prepend some silence (first bit of speech might else be treated as
        # noise)
        subprocess.check_call(['praat', '../misc/prepend_silence.praat',
                               in_fname, tmp_fname1])
        # run pocketsphinx (printing to log so only transcript is written to
        # stdout)
        comp_proc = subprocess.run(
            ['pocketsphinx_continuous', '-infile', tmp_fname1,
             '-logfn', tmp_fname2],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    finally:
        # always clean up, even if one of the external tools failed; a file
        # may not exist if the failure happened before it was written
        for tmp_fname in (tmp_fname1, tmp_fname2):
            try:
                remove(tmp_fname)
            except FileNotFoundError:
                pass
    transcript = comp_proc.stdout.decode('utf-8')
    return transcript.replace('\n', '').replace('\r', '')
def detect_tts_speech_rate(tts_type, voice, rate_modifier, ip_addr=None,
                           port=None):
    """determines tts average speech rate for given rate modifier

    every line of the syllable count corpus is synthesized with the given
    rate modifier; lines for which the tts server answers with an http error
    are skipped

    args:
        tts_type: tts software to use; specified by one of the TTS_TYPE_*
            constants at the beginning of this module (only those tts are
            supported); anything other than TTS_TYPE_MARY is synthesized
            from sable markup
        voice: voice to be used for synthesis
        rate_modifier: rate modifier (ssml/sable) to be used for synthesis
        ip_addr: passed on to synthesize()
        port: passed on to synthesize()

    returns:
        mean speech rate and standard deviation, in syllables per second

    raises:
        ZeroDivisionError: no corpus line could be synthesized, so there is
            nothing to average
    """
    syll_rates = []
    corpus = load_syllable_count_corpus()
    for line in corpus:
        # line[0] is used as the syllable count, line[1] as the text
        if tts_type == TTS_TYPE_MARY:
            in_str = get_ssml(line[1], rate_modifier)
            input_type = INPUT_TYPE_SSML
        else:
            in_str = get_sable(line[1], rate_modifier)
            input_type = INPUT_TYPE_SABLE
        out_fname = get_unique_fname('../tmp/speech_rate', '.wav')
        try:
            try:
                synthesize(in_str, False, input_type, out_fname, tts_type,
                           ip_addr, port, voice)
            except requests.exceptions.HTTPError:
                # skip this line; synthesize() writes the server response to
                # out_fname before raising, so the file is still removed by
                # the finally clause below
                continue
            duration = float(
                extract_feature_values(out_fname)['speech_duration'])
            syll_rates.append(line[0] / duration)
        finally:
            # remove the output file on every path (success, skipped line,
            # failed feature extraction); it may not exist if synthesis
            # failed before anything was written
            try:
                remove(out_fname)
            except FileNotFoundError:
                pass
    return sum(syll_rates) / len(syll_rates), numpy.std(syll_rates)
def extract_feature_values(in_fname):
    """runs a praat script to extract a given wav file's feature values

    args:
        in_fname: name of the wav file which should be analyzed

    returns:
        a dictionary containing several feature values, like intensity_mean;
        keys and values are the strings read from the script's output

    raises:
        subprocess.CalledProcessError: script call did not return with code 0
        ValueError: a non-blank output line is not a single "key,value" pair
    """
    tmp_fname = get_unique_fname('../tmp/features', '.txt')
    feat_val_dict = {}
    try:
        subprocess.check_call(['praat', '../misc/extract_features.praat',
                               in_fname, tmp_fname])
        # extract comma-separated key value pairs from output file
        with open(tmp_fname, 'r') as out_file:
            for line in out_file:
                line = line.replace('\n', '')
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing empty line)
                    continue
                key, val = line.split(',')
                feat_val_dict[key] = val
    finally:
        # delete the output file even if the script or the parsing failed;
        # it may not exist if the script never wrote it
        try:
            remove(tmp_fname)
        except FileNotFoundError:
            pass
    return feat_val_dict
def main():
    """runs the interactive spoken dialog if the module is executed directly

    prints the instructions, speaks a greeting and then alternates between
    recording the user and speaking a generated response until the user
    types "stop"
    """
    instructions = (
        'this is an interactive dialog system using speech input and output.'
        '\nit is based on the eliza system, which means its '
        'setting is that of rogerian psychotherapy.\nafter each output from '
        'the system, please hit enter to start recording your response and '
        'enter again to stop recording.\nyou might see some error messages '
        'even if the system works without issue in which case you can '
        'ignore them.\nhit enter now to start.')
    print(instructions)
    # wait for input on stdin before starting (the prompt asks for enter)
    sys.stdin.read(1)

    # speak the opening line of the therapist
    greeting_wav = get_unique_fname('../tmp/%s_eliza_in', '.wav')
    greeting = ('hello, i am a psychotherapist. please tell me about your '
                'problems.')
    remote_tts.synthesize(greeting, out_fname=greeting_wav)
    print('me: %s' % greeting)
    play_audio(greeting_wav)
    remove(greeting_wav)

    # one iteration per dialog turn; only the user can end the loop
    while True:
        user_wav = get_unique_fname('../tmp/%s_eliza_in', '.wav')
        reply_wav = get_unique_fname('../tmp/%s_eliza_out', '.wav')
        print('please hit enter and say your response or type "stop" to stop')
        if input() == 'stop':
            break

        # record and transcribe what the user said
        record_audio(user_wav)
        user_text = remote_tts.transcribe_wav(user_wav)
        print('you: %s' % user_text)

        # generate the reply and speak it, passing the user's recording to
        # synthesize_alike()
        reply_text = generate_response(user_text).lower()
        print('me: %s' % reply_text)
        remote_tts.synthesize_alike(reply_text, user_wav,
                                    out_fname=reply_wav)
        play_audio(reply_wav)

        for wav_fname in (user_wav, reply_wav):
            remove(wav_fname)
def synthesize_with_features(in_str, speech_rate=None, intensity=None,
                             pitch=None, in_str_is_fname=False,
                             out_fname=None, tts_type=None, ip_addr=None,
                             port=None, voice=None, speech_rates_dict=None):
    """generates wav from plain text with given speech rate, intensity and
    pitch

    args:
        in_str: text which should be synthesized; either directly plain text
            or the name of a file from which to read plain text
        speech_rate: target mean speech rate in syllables per second
            (3.0-8.0, values outside that range are clamped to it); the
            'default' rate modifier is used if none given
        intensity: target mean intensity in decibel; passed on to adapt_wav()
        pitch: target mean pitch in hertz; 'default' is used if none given
        speech_rates_dict: see load_speech_rates_dict(); offered as a
            parameter so it can be loaded once and reused for efficiency;
            loaded in this function if none given and a speech rate is
            requested
        (for details on other parameters see synthesize())

    returns and raises:
        see synthesize(); additionally raises ValueError if the given
        tts_type is not supported
    """
    # if in_str is a file name, read string to synthesize from that file
    if in_str_is_fname:
        with open(in_str, 'r') as in_file:
            in_str = in_file.read()

    pitch = pitch if pitch else 'default'

    # select markup type, default voice and markup generator for the tts
    if not tts_type or tts_type == TTS_TYPE_MARY:
        input_type = INPUT_TYPE_SSML
        voice = voice if voice else DEFAULT_VOICE_MARY
        rates_key = 'mary'
        get_markup = get_ssml
    elif tts_type == TTS_TYPE_FESTIVAL:
        input_type = INPUT_TYPE_SABLE
        # festival needs its own default voice, not the marytts one
        voice = voice if voice else DEFAULT_VOICE_FESTIVAL
        rates_key = 'festival'
        get_markup = get_sable
    else:
        raise ValueError('given tts_type not supported')

    if speech_rate is None:
        # no target rate requested, nothing to look up
        rate_modifier = 'default'
    else:
        # adjust target speech rate to be within the supported range
        if speech_rate < 3.0:
            speech_rate = 3.0
        elif speech_rate > 8.0:
            speech_rate = 8.0
        speech_rates_dict = (speech_rates_dict if speech_rates_dict
                             else load_speech_rates_dict())
        rate_modifier = \
            speech_rates_dict[rates_key][voice][round(speech_rate, 1)]

    # generate appropriate markup from plain text; only speech rate and pitch
    # are adjusted that way, intensity through praat (this combination is
    # most efficient and accurate)
    in_str = get_markup(in_str, rate_modifier, pitch)

    tmp_fname = synthesize(in_str, False, input_type, None, tts_type,
                           ip_addr, port, voice)
    out_fname = out_fname if out_fname \
        else get_unique_fname('../tmp/synthesis_final', '.wav')
    try:
        adapt_wav(tmp_fname, out_fname, intensity=intensity)
    finally:
        # the intermediate synthesis result is never needed afterwards, not
        # even if the adaptation failed
        remove(tmp_fname)
    return out_fname
def synthesize(in_str, in_str_is_fname=False, input_type=None, out_fname=None,
               tts_type=None, ip_addr=None, port=None, voice=None):
    """sends given string to a tts server and writes response to a file

    args:
        in_str: plain text or markup string for synthesis or name of a file
            that contains such a string
        in_str_is_fname: whether in_str should be treated as a file name
            (True) or directly as a string to synthesize (False, default)
        input_type: whether the input is plain text (default) or some markup;
            specified by one of the INPUT_TYPE_* constants at the beginning
            of this module
        out_fname: server response is written to this file location; if the
            return status is not ok, this contains additional info; if no
            name is given, a default will be used and returned by this
            function
        tts_type: tts software to use; specified by one of the TTS_TYPE_*
            constants at the beginning of this module (only those tts are
            supported); marytts is used if none given
        ip_addr: ip address of the tts server, localhost (127.0.0.1) is used
            if none given
        port: port of the tts server, default for tts is used if none given
            (59125 for marytts, 1314 for festival)
        voice: name of the voice to use; default for tts is used if none
            given

    returns:
        name of the output file, same as out_fname if that was given

    raises:
        requests.exceptions.RequestException: the connection failed or the
            server did not return an ok status (marytts)
        subprocess.CalledProcessError: festival_client did not return with
            code 0 (festival)
        ValueError: the given tts_type or input_type is not supported
    """
    # if in_str is a file name, read string to synthesize from that file
    if in_str_is_fname:
        with open(in_str, 'r') as in_file:
            in_str = in_file.read()

    # set defaults for missing parameters
    input_type = input_type if input_type else INPUT_TYPE_TEXT
    tts_type = tts_type if tts_type else TTS_TYPE_MARY
    ip_addr = ip_addr if ip_addr else '127.0.0.1'
    out_fname = out_fname if out_fname \
        else get_unique_fname('../tmp/synthesis', '.wav')

    # communicate with tts server in individually appropriate way
    if tts_type == TTS_TYPE_MARY:
        port = port if port else 59125
        voice = voice if voice else DEFAULT_VOICE_MARY
        if input_type != INPUT_TYPE_TEXT and input_type != INPUT_TYPE_SSML:
            raise ValueError('given input_type not supported for marytts')
        params = {
            'INPUT_TEXT': in_str,
            'INPUT_TYPE': input_type,
            'OUTPUT_TYPE': 'AUDIO',
            'LOCALE': 'en_US',
            'AUDIO': 'WAVE_FILE',
            'VOICE': voice
        }
        resp = requests.post('http://%s:%d/process' % (ip_addr, port),
                             data=params, stream=True)
        try:
            with open(out_fname, 'wb') as out_file:
                for chunk in resp.iter_content(8192):
                    out_file.write(chunk)
            # raise exception if http request came back with an error; do
            # this only after writing output so failure response is logged
            # TODO: write to different file (txt, not wav) and include note
            #       in msg?
            resp.raise_for_status()
        finally:
            # release the streamed connection on every path
            resp.close()
    elif tts_type == TTS_TYPE_FESTIVAL:
        port = port if port else 1314
        voice = voice if voice else DEFAULT_VOICE_FESTIVAL
        # reject unsupported input before any temporary file is created
        if input_type != INPUT_TYPE_TEXT and input_type != INPUT_TYPE_SABLE:
            raise ValueError('given input_type not supported for festivaltts')
        args = ['festival_client', '--server', ip_addr, '--port', str(port),
                '--ttw', '--otype', 'wav', '--output', out_fname]
        tmp_fnames = []
        try:
            if input_type == INPUT_TYPE_TEXT:
                # for plain text input, voice must be specified in prolog
                # file
                prolog_fname = get_unique_fname('../tmp/festival_prolog',
                                                '.wav')
                tmp_fnames.append(prolog_fname)
                with open(prolog_fname, 'wb') as prolog_file:
                    prolog_file.write(('(%s)' % voice).encode('utf-8'))
                args.append('--prolog')
                args.append(prolog_fname)
            else:
                # for sable input, tts_mode option must be set
                args.append('--tts_mode')
                args.append('sable')
                # for consistent interface in this function, voice is not
                # assumed to already be specified in given sable string but
                # set here instead
                in_str = in_str.replace('<<<voice>>>', voice)
            in_fname = get_unique_fname('../tmp/festival_input')
            tmp_fnames.append(in_fname)
            with open(in_fname, 'wb') as tmp_file:
                tmp_file.write(in_str.encode('utf-8'))
            args.append(in_fname)
            subprocess.check_call(args)
        finally:
            # remove the temporary input files even if festival_client
            # failed; a file may not exist if writing it never started
            for tmp_fname in tmp_fnames:
                try:
                    remove(tmp_fname)
                except FileNotFoundError:
                    pass
    else:
        raise ValueError('given tts_type not supported')

    return out_fname