def processLine(line, mary, scriptpath):
    # http://stackoverflow.com/questions/10190981/get-a-unique-id-for-worker-in-python-multiprocessing-pool
    tokens = []
    proc_num = -1  # default, in case we are not running inside a Pool worker
    try:
        proc_num = multiprocessing.current_process()._identity[0] - 1
        tokens, phonemes = common_utils.getCleanTokensAndPhonemes(line, mary, proc_num)
    except Exception as err:
        print '[', proc_num, ']', 'Error, omitting', line
        print err
        # If the MARY server stopped responding, restart it and continue with the next line
        if scriptpath != '' and ('Read timed out' in str(err)):
            print 'restarting maryServer'
            restartMaryServer(scriptpath, None)
    return ' '.join(tokens)
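# --- Illustrative sketch (not part of the original sources) ---
# processLine is written to run inside a multiprocessing.Pool worker: it reads the
# worker's _identity to derive proc_num, so each worker can talk to its own MARY
# server instance. A minimal driver could look like the function below; the helper
# name cleanLinesParallel and the partial() binding of mary/scriptpath are
# assumptions for illustration, not taken from the original code.
from functools import partial
import multiprocessing

def cleanLinesParallel(lines, mary, scriptpath, num_workers=4):
    # Bind the shared arguments so each worker only receives the line it should clean
    pool = multiprocessing.Pool(num_workers)
    try:
        worker = partial(processLine, mary=mary, scriptpath=scriptpath)
        return pool.map(worker, lines)
    finally:
        pool.close()
        pool.join()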
def getUtterances(ids, postfix_speaker, cache_cleaned_sentences=True):
    '''Loads the corpus and builds a Python data structure that can be used to export the corpus to a format KALDI understands'''
    utts = []
    cleaned_sentences_cache = {}
    utts_phoneme_dict = {}
    print 'Reading corpus transcriptions and producing automatic corpus phoneme dict for OOV words (you need MARY running in the background!)'
    lastutt = None
    for myid in ids:
        print '.',
        #if 1==1:
        try:
            with codecs.open(myid + '.xml', 'r', 'utf-8') as myfile:
                # extract xml meta data
                xml = myfile.read()
                soup = BeautifulSoup(xml)
                sentence = soup.recording.sentence.string
                cleaned_sentence = soup.recording.cleaned_sentence.string
                gender = soup.recording.gender.string
                age = soup.recording.ageclass.string
                corpus = soup.recording.corpus.string
                nativespeaker = soup.recording.muttersprachler.string
                region = soup.recording.bundesland.string
                speakerid = soup.recording.speaker_id.string
                if speakerid is None or speakerid == '':
                    print 'ERROR, speakerid not found for', myid
                date = getDateFromID(myid)
                if cache_cleaned_sentences and (cleaned_sentence not in cleaned_sentences_cache):
                    clean_sentence_tokens, token_phonemes = common_utils.getCleanTokensAndPhonemes(cleaned_sentence, mary)
                    cleaned_sentences_cache[cleaned_sentence] = (clean_sentence_tokens, token_phonemes)
                    #print 'cleaning ', cleaned_sentence, ' -> ', clean_sentence_tokens, ' phonemes:', token_phonemes
                else:
                    clean_sentence_tokens, token_phonemes = cleaned_sentences_cache[cleaned_sentence]
                if not cache_cleaned_sentences:
                    clean_sentence_tokens, token_phonemes = common_utils.getCleanTokensAndPhonemes(sentence, mary)
                # Collect MARY's phoneme representation for every token not seen yet (automatic OOV dictionary)
                for token, phoneme_representation in itertools.izip(clean_sentence_tokens, token_phonemes):
                    if token not in utts_phoneme_dict:
                        utts_phoneme_dict[token] = phoneme_representation
                clean_sentence_tokens = cleaned_sentence.split(' ')
                utt = {'id': myid.split('/')[-1], 'fileid': myid, 'sentence': sentence,
                       'clean_sentence_tokens': clean_sentence_tokens, 'speakerid': speakerid,
                       'gender': gender, 'age': age, 'corpus': corpus,
                       'nativespeaker': nativespeaker, 'region': region, 'date': date}
                utts.append(utt)
        except Exception as err:
            print 'Error in file, omitting', myid
            print err

    # Sort utterances by date
    utts = sorted(utts, key=lambda utt: utt['date'])

    # Unfortunately, the xmls don't have speaker meta-information, we try to guess it here
    #for i, utt in enumerate(utts):
    #    if lastutt is not None:
    #        delta = utt['date'] - lastutt['date']
    #        diff = abs(delta.total_seconds())
    #        # Heuristic: either enough time passed between this and the last recording, or speaker meta information (gender, age, region) changed
    #        if diff > speakerid_diff_heuristic or lastutt['gender'] != utt['gender'] or lastutt['age'] != utt['age'] or lastutt['region'] != utt['region']:
    #            print 'probable new speaker', speakerid
    #            if diff > speakerid_diff_heuristic:
    #                print 'based on time diff', diff
    #            else:
    #                print 'based on meta', 'diff:', diff, lastutt['gender'], utt['gender'], lastutt['age'], utt['age'], lastutt['region'], utt['region']
    #            speakerid += 1
    #        utt['speakerid'] = 's' + ('%04d' % speakerid) + postfix_speaker

    for utt in utts:
        utt['kaldi_id'] = utt['speakerid'] + '_' + utt['id']
        #utts[i] = utt
        #lastutt = utt

    # Filter utterances with repeat in file name (recording was repeated after a wrong utterance)
    #utts = filterRepeatUtterances(utts)
    return utts, utts_phoneme_dict
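# --- Illustrative sketch (not part of the original sources) ---
# The utterance dicts returned by getUtterances already contain what a Kaldi data
# directory needs (kaldi_id, speakerid, clean_sentence_tokens). A minimal, hypothetical
# exporter for the 'text' and 'utt2spk' files could look like this; the helper name
# writeKaldiDataDir is an assumption, only the standard Kaldi file layout is taken as given.
import codecs

def writeKaldiDataDir(utts, outdir):
    with codecs.open(outdir + '/text', 'w', 'utf-8') as text_out, \
         codecs.open(outdir + '/utt2spk', 'w', 'utf-8') as utt2spk_out:
        # Kaldi expects entries sorted by utterance id
        for utt in sorted(utts, key=lambda u: u['kaldi_id']):
            text_out.write(utt['kaldi_id'] + ' ' + ' '.join(utt['clean_sentence_tokens']) + '\n')
            utt2spk_out.write(utt['kaldi_id'] + ' ' + utt['speakerid'] + '\n')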
from __future__ import print_function  #, unicode_literals

import maryclient
import codecs
import common_utils
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Retrieves pronunciation entries of arbitrary German words (using the TTS software MARY) for a whole word list.')
    parser.add_argument('-i', '--inputfile', dest='inputfile',
                        help='Process this word list (one per line, utf-8)', type=str, default='')
    parser.add_argument('-o', '--outputfile', dest='outputfile',
                        help='Export pronunciation entries to this output file (one per line, utf-8)', type=str, default='')

    args = parser.parse_args()

    mary = maryclient.maryclient()

    dictionary = {}
    with codecs.open(args.inputfile, 'r', 'utf-8') as inputfile:
        for word in inputfile:
            tokens, phonemes = common_utils.getCleanTokensAndPhonemes(word, mary)
            if len(phonemes) != 1:
                print('Warning, MARY split this word into more than one token:', word, phonemes)
            # word[:-1] strips the trailing newline of the input line
            dictionary[word[:-1]] = ''.join(phonemes[0])

    with codecs.open(args.outputfile, 'w', 'utf-8') as outputfile:
        for word in sorted(dictionary):
            outputfile.write(word + ' ' + dictionary[word] + '\n')
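# --- Illustrative usage (not part of the original sources) ---
# Assuming a MARY TTS server is running and reachable through maryclient's defaults,
# the script is invoked with a word list and an output path (file names here are
# placeholders):
#
#   python <this_script>.py -i wordlist.txt -o lexicon.txt
#
# Every line of the resulting lexicon has the form "<word> <phoneme string>", as
# written by the outputfile.write(...) call above.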
def getUtterances(ids, use_mary=False, cache_cleaned_sentences=True):
    '''Loads the corpus and builds a Python data structure that can be used to export the corpus to a format KALDI understands'''
    utts = []
    cleaned_sentences_cache = {}
    utts_phoneme_dict = {}
    print('Reading and parsing TUDA corpus transcriptions', end='', flush=True)
    lastutt = None
    for i, myid in enumerate(ids):
        if i % 100 == 0:
            print('.', end='', flush=True)
        try:
            with codecs.open(myid + '.xml', 'r', 'utf-8') as myfile:
                # extract xml meta data
                xml = myfile.read()
                soup = BeautifulSoup(xml, "lxml")
                sentence = soup.recording.sentence.string
                cleaned_sentence = soup.recording.cleaned_sentence.string
                gender = soup.recording.gender.string
                age = soup.recording.ageclass.string
                corpus = soup.recording.corpus.string
                nativespeaker = soup.recording.muttersprachler.string
                region = soup.recording.bundesland.string
                speakerid = soup.recording.speaker_id.string
                if speakerid is None or speakerid == '':
                    print('ERROR, speakerid not found for', myid)
                date = getDateFromID(myid)
                if use_mary:
                    if cache_cleaned_sentences and (cleaned_sentence not in cleaned_sentences_cache):
                        clean_sentence_tokens, token_phonemes = common_utils.getCleanTokensAndPhonemes(cleaned_sentence, mary)
                        cleaned_sentences_cache[cleaned_sentence] = (clean_sentence_tokens, token_phonemes)
                        #print 'cleaning ', cleaned_sentence, ' -> ', clean_sentence_tokens, ' phonemes:', token_phonemes
                    else:
                        clean_sentence_tokens, token_phonemes = cleaned_sentences_cache[cleaned_sentence]
                    if not cache_cleaned_sentences:
                        clean_sentence_tokens, token_phonemes = common_utils.getCleanTokensAndPhonemes(sentence, mary)
                    # zip instead of itertools.izip, which no longer exists in Python 3
                    for token, phoneme_representation in zip(clean_sentence_tokens, token_phonemes):
                        if token not in utts_phoneme_dict:
                            utts_phoneme_dict[token] = phoneme_representation
                clean_sentence_tokens = cleaned_sentence.split(' ')
                utt = {'id': myid.split('/')[-1], 'fileids': ids[myid], 'sentence': sentence,
                       'clean_sentence_tokens': clean_sentence_tokens, 'speakerid': speakerid,
                       'gender': gender, 'age': age, 'corpus': corpus,
                       'nativespeaker': nativespeaker, 'region': region, 'date': date}
                utts.append(utt)
        except Exception as err:
            print('Error in file, omitting', myid)
            print(err)

    # Sort utterances by date
    utts = sorted(utts, key=lambda utt: utt['date'])

    for utt in utts:
        utt['kaldi_id'] = utt['speakerid'] + '_' + utt['id']

    # Filter utterances with repeat in file name (recording was repeated after a wrong utterance)
    #utts = filterRepeatUtterances(utts)
    return utts, utts_phoneme_dict
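# --- Illustrative note (not part of the original sources) ---
# Unlike the older variant above, this version indexes `ids` as a mapping
# (utt['fileids'] = ids[myid]), so the caller is expected to pass something like
# {utterance_id: [associated audio file ids, ...]}. A hypothetical call:
#
#   utts, phoneme_dict = getUtterances(id2fileids, use_mary=False)
#   for utt in utts:
#       print(utt['kaldi_id'], len(utt['fileids']), 'audio file(s)')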