Example #1
    def load(self):
        ## Read processor settings, falling back to defaults:
        self.target_nodes = self.config.get('target_nodes', "//phone")
        self.input_attribute = self.config.get('input_attribute', 'pronunciation')
        self.dictionary = self.config.get('dictionary', 'some_dictionary_name')

        self.phone_table_file = os.path.join(self.get_location(), 'phones.txt')

        ## The processor counts as trained iff the phone table loads:
        try:
            self.phone_table = LookupTable(self.phone_table_file, is_phoneset=True)
            self.trained = True
        except Exception:  ## a bare except would also swallow KeyboardInterrupt
            self.trained = False
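
## The phone table loaded above maps each phone to named feature fields which
## get copied onto utterance nodes. An illustrative layout (assumed for this
## sketch -- not Ossian's exact schema; only the 'vowel_cons' field is
## referenced explicitly in the code below):
##
##   phone   vowel_cons   ...
##   a       vowel        ...
##   b       cons         ...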
Example #2
import codecs, fileinput, glob, os, re, shutil, subprocess, sys, unicodedata
import regex  ## third-party: supports \p{...} unicode property classes

## SUtteranceProcessor, LookupTable, NodeProcessors, Element, readlist,
## writelist and the constants module `c` are assumed to come from the
## surrounding Ossian codebase.

class PhoneticFeatureAdder(SUtteranceProcessor):
    def load(self):
        ## Read processor settings, falling back to defaults:
        self.target_nodes = self.config.get('target_nodes', "//phone")
        self.input_attribute = self.config.get('input_attribute', 'pronunciation')
        self.dictionary = self.config.get('dictionary', 'some_dictionary_name')

        self.phone_table_file = os.path.join(self.get_location(), 'phones.txt')

        ## The processor counts as trained iff the phone table loads:
        try:
            self.phone_table = LookupTable(self.phone_table_file, is_phoneset=True)
            self.trained = True
        except Exception:  ## a bare except would also swallow KeyboardInterrupt
            self.trained = False
    
    
    def train(self, corpus, text_corpus):
        dict_location = os.path.join(self.voice_resources.path[c.LANG], 'labelled_corpora', self.dictionary)
        assert os.path.isdir(dict_location), "Dictionary directory %s doesn't exist" % (dict_location)
        original_phonetable_file = glob.glob(os.path.join(dict_location, '*.table'))[0]  ## take first
        shutil.copy(original_phonetable_file, self.phone_table_file)

        self.load()
    
    def process_utterance(self, utt):
        for node in utt.xpath(self.target_nodes):
            assert node.has_attribute(self.input_attribute)
            phone = node.get(self.input_attribute)

            ## Copy every feature field of the phone table onto the node:
            for feature in self.phone_table.fields:
                value = self.phone_table.lookup(phone, field=feature)
                node.set(feature, value)
class Lexicon(SUtteranceProcessor):

    def __init__(self, processor_name='lexicon', target_nodes="//token",
                 target_attribute='text', part_of_speech_attribute='pos',
                 child_node_type='segment', output_attribute='pronunciation',
                 class_attribute='token_class', word_classes=['word'],
                 probable_pause_classes=['punctuation', c.TERMINAL],
                 possible_pause_classes=['space'],
                 dictionary='some_dictionary_name', backoff_pronunciation='axr',
                 lts_variants=1, lts_ntrain=0, lts_gram_length=3,
                 max_graphone_letters=2, max_graphone_phones=2):

        self.processor_name = processor_name
        self.target_nodes = target_nodes
        self.target_attribute = target_attribute
        self.part_of_speech_attribute = part_of_speech_attribute
        self.child_node_type = child_node_type
        self.output_attribute = output_attribute
        self.class_attribute = class_attribute
        self.word_classes = word_classes
        self.probable_pause_classes = probable_pause_classes
        self.possible_pause_classes = possible_pause_classes

        # Lexicon
        self.dictionary = dictionary  # used for training
        self.backoff_pronunciation = backoff_pronunciation  # use this if lookup and LTS fail for whatever reason...
        self.lts_variants = lts_variants
        #
        ## Settings for LTS training:
        self.lts_ntrain = lts_ntrain  ## train on n words -- 0 means all.
        self.lts_gram_length = lts_gram_length  # 1: context independent graphones
        self.max_graphone_letters = max_graphone_letters
        self.max_graphone_phones = max_graphone_phones

        super(Lexicon, self).__init__()

        self.parallelisable = False  ## poor parallelisation due to sequitur (## see: http://stackoverflow.com/questions/20727375/multiprocessing-pool-slower-than-just-using-ordinary-functions)

    def verify(self, voice_resources):
        self.voice_resources = voice_resources

        ## --- find and check required binaries ---
        self.lts_tool = os.path.join(self.voice_resources.path[c.BIN],
                                     'g2p.py')
        tool_executable = os.access(self.lts_tool, os.X_OK)
        if not tool_executable:
            sys.exit("LTS tool %s doesn't exist or is not executable" %
                     (self.lts_tool))

        ## If this component has been trained for a previous model and stored globally, this is
        ## where it will be:--
        lex_config_name = 'lexicon-%s_sequitur_LTS_ntrain%s_gramlen%s_phon%s_lett%s'%(self.dictionary, \
                    self.lts_ntrain, self.lts_gram_length, self.max_graphone_phones, self.max_graphone_letters)
        self.component_path = self.voice_resources.get_dirname(lex_config_name,
                                                               c.COMPONENT,
                                                               create=False)

        ## Try loading model:  # similar to acoustic model code -- refactor and put this in UtteranceProcessor?
        # The processor-specific part would be the names of the trained model's components.
        self.trained = True
        self.model_dir = self.get_location()

        if not os.path.isdir(self.model_dir):
            self.trained = False

        ## verify all the parts needed are present: if all the model files exist, count it as trained:
        self.lexicon_fname = os.path.join(self.model_dir, 'lexicon.txt')
        self.lts_fname = os.path.join(self.model_dir, 'lts.model')
        self.phoneset_fname = os.path.join(self.model_dir, 'phones.table')
        self.onsets_fname = os.path.join(self.model_dir, 'onsets.txt')
        self.letter_fname = os.path.join(self.model_dir, 'letter.names')

        for component in [self.lexicon_fname, self.lts_fname, self.phoneset_fname,
                          self.onsets_fname, self.letter_fname]:
            if not os.path.isfile(component):
                self.trained = False

        if self.trained:
            self.load_lexicon()  # populate self.entries
            self.load_onsets()  # populate self.onsets
            self.phoneset = LookupTable(self.phoneset_fname, is_phoneset=True)
            self.load_letternames()  # populate self.letternames


        extra_lex = os.path.join(self.model_dir, 'extra_lex.txt')
        if os.path.isfile(extra_lex):
            print '    extra lexicon exists --> loading it!'
            self.load_extra_lexicon(extra_lex)

        ## Add locations for sequitur g2p to pythonpath:
        tooldir = os.path.join(self.voice_resources.path[c.BIN], '..')
        sitepackages_dir = glob.glob(
            tooldir + '/lib*/python*/site-packages')  ## lib vs lib64?
        assert len(sitepackages_dir) > 0
        sitepackages_dir = sitepackages_dir[0]

        ## Prepend this to relevant system calls -- using sequitur via python
        ## would obviously be a lot neater.
        self.g2p_path = 'export PYTHONPATH=%s:%s ; ' % (
            sitepackages_dir, os.path.join(tooldir, 'g2p'))

    def load_letternames(self):
        data = readlist(self.letter_fname)
        self.letternames = {}
        for line in data:
            line = line.strip(' \n')
            letter, pron = re.split('\s+', line, maxsplit=1)
            self.letternames[letter] = pron
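
    ## letter.names is parsed as one '<letter> <phone sequence>' entry per
    ## line, split on the first run of whitespace; illustrative content
    ## (invented phones, not a real Ossian file):
    ##   a    ei1
    ##   b    b ii1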

    def convert_lexicon(self, files, format='festival'):

        print '     convert lexicon...'
        entries = {}
        seen_tags = {}  ## for reporting
        if format == 'festival':
            for line in fileinput.input(
                    files, openhook=fileinput.hook_encoded("utf8")):
                line = line.strip(' \n')
                if line.startswith(';') or line == '' or line == 'MNCL':
                    continue  ## skip Scheme comment lines, empty lines, and the literal MNCL header line

                (headword, tags, pronun) = self.read_festival_lexentry(line)

                if headword not in entries:
                    entries[headword] = []
                entries[headword].append([tags, pronun])
                seen_tags[tags] = ''
        else:
            sys.exit('Unknown lexicon format: %s' % (format))

        print 'Tags in lexicon: '
        print seen_tags.keys()

        f = codecs.open(self.lexicon_fname, 'w', encoding='utf8')
        for head_word in sorted(entries.keys()):
            for (tag, pron) in entries[head_word]:
                f.write('%s\t%s\t%s\n' % (head_word, tag, pron))
        f.close()

        self.entries = entries
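
    ## The converted lexicon is one tab-separated '<headword>\t<tag>\t<pron>'
    ## line per entry, which load_lexicon() below reads back; an illustrative
    ## line (invented phones): 'abandon\tvbp\t@0 b | a1 n | d @0 n'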

    def load_lexicon(self):

        assert os.path.isfile(self.lexicon_fname)
        items = readlist(self.lexicon_fname)
        self.entries = {}
        for item in items:
            (head, tag, pron) = item.split('\t')
            tag = tag.split(',')
            if head not in self.entries:
                self.entries[head] = []
            self.entries[head].append((tag, pron))

    def load_extra_lexicon(self, extra_lex):

        assert os.path.isfile(extra_lex), 'not file: ' + extra_lex
        items = readlist(extra_lex)
        for item in items:
            if item.startswith('#') or re.match('\A\s*\Z', item):
                continue
            (head, tag, pron) = item.split('\t')
            tag = tag.split(',')
            if '|' not in pron:
                pron = self.syllabify(pron)
            if head not in self.entries:
                self.entries[head] = []
            self.entries[head].append((tag, pron))

    def load_onsets(self):
        onsets = readlist(self.onsets_fname)
        onsets = [tuple(line.split(' ')) for line in onsets]
        self.onsets = dict(zip(onsets, onsets))

    def process_utterance(self, utt):
        for node in utt.xpath(self.target_nodes):
            assert node.has_attribute(self.class_attribute)
            assert node.has_attribute(self.target_attribute)

            current_class = node.attrib[self.class_attribute]

            if current_class in self.word_classes:
                word = node.attrib[self.target_attribute]
                pos = node.attrib.get(self.part_of_speech_attribute,
                                      None)  # default to None
                (pronunciation,
                 method) = self.get_phonetic_segments(word, part_of_speech=pos)
                node.set('phones_from', method)
                NodeProcessors.add_syllable_structure(node, pronunciation, syllable_delimiter='|', syllable_tag='syllable', \
                                phone_tag='segment', pronunciation_attribute='pronunciation', stress_attribute='stress')
            elif current_class in self.probable_pause_classes:
                pronunciation = c.PROB_PAUSE  # [c.PROB_PAUSE]
                child = Element('segment')
                child.set('pronunciation', pronunciation)
                node.add_child(child)
            elif current_class in self.possible_pause_classes:
                pronunciation = c.POSS_PAUSE  # [c.POSS_PAUSE]
                child = Element('segment')
                child.set('pronunciation', pronunciation)
                node.add_child(child)
            else:
                sys.exit(
                    'Class "%s" not in any of word_classes, probable_pause_classes, possible_pause_classes'
                    % (current_class))

    def get_phonetic_segments(self, word, part_of_speech=None):

        word = word.lower()
        word = word.strip("'\" ;,")

        initialism_patt = '\A([a-z]\.)+\Z'
        if re.match(initialism_patt, word):
            pronunciation = self.get_initialism(word)
            method = 'letter_prons'
        elif word in self.entries:
            method = 'lex'
            if len(self.entries[word]) == 1:  ## unique, no disambig necessary
                tag, pronunciation = self.entries[word][
                    0]  ## for now take first
            else:
                ## filter ambiguous pronunciations by first part of tag (POS):
                ## if there *is* no POS, take first in list:
                if not part_of_speech:
                    print 'WARNING: no pos tag to disambiguate pronunciation of "%s" -- take first entry in lexicon' % (
                        word)
                    tag, pronunciation = self.entries[word][0]  #take first
                else:
                    wordpos = part_of_speech.lower()  # node.attrib['pos']
                    filtered = [(tag, pron) for (tag,pron) in self.entries[word] \
                                                                 if tag[0] == wordpos]
                    if len(filtered) == 0:
                        tag, pronunciation = self.entries[word][
                            0]  #if no POS matches, take first anyway
                    else:
                        tag, pronunciation = filtered[
                            0]  ## take first matching filtered dictionary entry

        else:
            if self.lts_variants == 1:
                pronunciation = self.get_oov_pronunciation(word)
            else:
                pronunciation = self.get_nbest_oov_pronunciations(
                    word, self.lts_variants)
            if pronunciation is not None:
                pronunciation = self.syllabify(pronunciation)
                method = 'lts'
            else:
                pronunciation = self.backoff_pronunciation
                method = 'default'

        return (pronunciation, method)

    def count_onsets_and_codas(self):
        print '     count onsets and codas...'
        onsets = {}
        codas = {}
        for (entry, prons) in self.entries.items():

            for (tag, pron) in prons:

                pron = re.sub(
                    '\d', '',
                    pron)  ## remove stress marks so we can look up vowels
                sylls = pron.split(' | ')
                for syll in sylls:
                    phones = syll.split(' ')
                    vowel_index = [i for (i,phone) in enumerate(phones) \
                            if self.phoneset.lookup(phone, field='vowel_cons')=='vowel']
                    if len(vowel_index) > 1:
                        print 'Multiple vowels found in syll %s in an entry for %s' % (
                            syll, entry)
                        continue
                    if len(vowel_index) < 1:
                        print 'No vowels found in syll %s in an entry for %s' % (
                            syll, entry)
                        continue
                    i = vowel_index[0]
                    onset = tuple(phones[:i])
                    coda = tuple(phones[i + 1:])
                    if onset not in onsets:
                        onsets[onset] = 0
                    onsets[onset] += 1
                    if coda not in codas:
                        codas[coda] = 0
                    codas[coda] += 1
        self.onsets = onsets
        self.codas = codas
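
    ## E.g. for a syllable 's t r e1 n d' (illustrative phones, with 'e'
    ## marked as a vowel in the phoneset), stripping stress gives
    ## 's t r e n d' with the vowel at index 3, so onset ('s', 't', 'r') and
    ## coda ('n', 'd') are each counted once.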

    def get_initialism(self, form):
        letters = form.lower().strip(' .').split('.')
        pronunciation = []
        for letter in letters:
            pronunciation.append(self.letternames[letter])
        pronunciation = ' | '.join(pronunciation)
        return pronunciation

    def syllabify(self, phonestring):
        '''
        Syllabify with maximum legal (=observed) onset.
        Take "e g z a1 m" 
        return "e g | z a1 m"
        '''
        assert '|' not in phonestring
        plain = re.sub(
            '\d', '',
            phonestring)  ## remove stress marks so we can look up vowels
        plainphones = plain.split(' ')
        phones = phonestring.split(' ')
        vowel_indexes = [i for (i,phone) in enumerate(plainphones) \
                            if self.phoneset.lookup(phone, field='vowel_cons')=='vowel']

        if len(vowel_indexes) > 0:  ## else add nothing to phones and return that.

            start = vowel_indexes[0] + 1

            for end in vowel_indexes[1:]:

                if start == end:  ## juncture between 2 vowels as in 'buyer'
                    best_split = start
                else:
                    split_scores = []
                    for split in range(start, end):
                        first_part = tuple(plainphones[start:split])
                        second_part = tuple(plainphones[split:end])

                        ## Take maximum legal onset:
                        if second_part in self.onsets:
                            score = len(second_part)
                        else:
                            score = -1

                        ## Older version: score is sum of onset and coda freqs:
                        # score = self.codas.get(first_part, 0) + self.onsets.get(second_part, 0)

                        split_scores.append((score, split))
                    split_scores.sort()

                    best_split = split_scores[-1][1]
                phones[best_split] = '| ' + phones[best_split]

                start = end + 1

        return ' '.join(phones)
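
    ## Worked trace for 'e g z a1 m', assuming 'e' and 'a' are vowels and the
    ## onset inventory contains ('z',) but not ('g', 'z'): the vowels sit at
    ## indexes 0 and 3, candidate splits 1 and 2 score -1 and 1, so the
    ## boundary lands before 'z', giving 'e g | z a1 m' as in the docstring.
    ## (A runnable standalone sketch of this rule appears at the end of the
    ## file.)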

    def do_training(self, corpus, text_corpus):

        dict_location = os.path.join(self.voice_resources.path[c.LANG],
                                     'labelled_corpora', self.dictionary)

        ## phoneset
        phonetable_files = glob.glob(os.path.join(dict_location, '*.table'))
        if phonetable_files == []:
            sys.exit('Cannot find any phone table files at %s' %
                     (os.path.join(dict_location, '*.table')))
        phonetable_file = phonetable_files[0]  ## take first
        shutil.copy(phonetable_file, self.phoneset_fname)
        ## load phoneset now for converting lexicon:
        self.phoneset = LookupTable(self.phoneset_fname, is_phoneset=True)

        ## letter pronunciations
        letter_file = os.path.join(dict_location, 'letter.names')
        assert os.path.isfile(letter_file)
        shutil.copy(letter_file, self.letter_fname)
        self.load_letternames()  # populate self.letternames

        ## lexicon
        dict_files = [f for f in glob.glob(os.path.join(dict_location, '*')) \
                    if f.endswith('.out')]

        ## exclude cmudict extensions -- deliberately not adding: or f.endswith('.scm')
        ## (glob doesn't support {} for .{out,scm}, hence the list comprehension)

        assert dict_files != [], 'No lexicon files found at %s' % (
            dict_location)
        self.convert_lexicon(dict_files)

        ## onsets
        self.count_onsets_and_codas()
        onset_strings = [' '.join(onset) for onset in self.onsets.keys()]
        writelist(onset_strings, self.onsets_fname)

        ## G2P
        train_file = os.path.join(self.get_training_dir(), 'train_data.txt')
        self.make_sequitur_train_data(train_file)
        self.train_sequitur_g2p(train_file)

        ## save it also globally for posterity:-
        if os.path.isdir(self.component_path):
            shutil.rmtree(self.component_path)
        shutil.copytree(self.model_dir, self.component_path)

    def make_sequitur_train_data(self, train_file):
        '''Write entries to file for training g2p, append stress to vowel symbols'''
        lines = []
        for (head, entry) in self.entries.items():
            for (tags, pronun) in entry:
                train_phones = pronun.replace(' | ', ' ').split(
                    ' ')  ## list of phones w/o syllab
                line = ' '.join([head] + train_phones) + '\n'
                lines.append(line)

        if self.lts_ntrain > 0:
            lines = lines[:self.lts_ntrain]

        f = codecs.open(train_file, 'w', encoding='utf8')
        for line in lines:
            f.write(line)
        f.close()
        print 'Wrote %s' % (train_file)
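
    ## train_data.txt then holds one '<word> <phone> <phone> ...' line per
    ## pronunciation, syllable bars removed; an illustrative line (invented
    ## phones): 'example i g z a1 m p @0 l'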

    def train_sequitur_g2p(self, train_file):
        '''Currently use system call -- TODO: keep this all in python?'''

        lts_model = self.lts_fname
        print 'Training LTS with sequitur...'
        ## train unigram model:
        n = 1
        comm = '%s %s --train %s -s 1,%s,1,%s --devel 5%% --encoding utf8 --write-model %s_%s > %s.log'%(self.g2p_path, \
                            self.lts_tool, train_file, self.max_graphone_letters, \
                            self.max_graphone_phones, lts_model, n, lts_model)
        print comm
        os.system(comm)
        n += 1
        while n <= self.lts_gram_length:
            comm = '%s %s --model %s_%s --ramp-up --train %s --devel 5%% --encoding utf8 --write-model %s_%s >> %s.log' % (
                self.g2p_path, self.lts_tool, lts_model, n - 1, train_file,
                lts_model, n, lts_model)
            print comm
            os.system(comm)
            n += 1
        shutil.copy('%s_%s' % (lts_model, self.lts_gram_length), lts_model)
        self.lts_model = lts_model
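
    ## Tracing the loop above for the default lts_gram_length=3 and graphone
    ## sizes 2,2 (PYTHONPATH prefix and paths abbreviated; stdout goes to
    ## lts.model.log), the commands issued are:
    ##   g2p.py --train train_data.txt -s 1,2,1,2 --devel 5% --encoding utf8 --write-model lts.model_1
    ##   g2p.py --model lts.model_1 --ramp-up --train train_data.txt --devel 5% --encoding utf8 --write-model lts.model_2
    ##   g2p.py --model lts.model_2 --ramp-up --train train_data.txt --devel 5% --encoding utf8 --write-model lts.model_3
    ## after which lts.model_3 is copied to lts.model.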

    def get_nbest_oov_pronunciations(self, word, nbest):
        '''return n best, sep by sil'''

        word = word.lower()
        word = self.strip_space_and_punc(word)  ## strip apostrophe and also other punc that gets into KY's words (Poirot'',)
        #escaped_word = "'" + word.replace("'", "'\\''") + "'" ## escape ' for shell command
        #escaped_word = "'" + word.replace("'", "") + "'"  ## strip apostrophe
        escaped_word = "'" + word + "'"


        comm = '%s echo %s | %s  --model %s  --variants-number %s --encoding utf8 --apply -'%(self.g2p_path,  \
                                                    escaped_word, self.lts_tool, self.lts_fname, nbest)

        pronun = subprocess.check_output(comm,
                                         shell=True,
                                         stderr=subprocess.STDOUT)
        if 'failed to convert' in pronun:
            print comm
            print 'WARNING: couldn\'t run LTS for %s' % (word)

        ## remove the 'stack usage' output line -- its position varies:
        pronun = unicodedata.normalize('NFKD', pronun.decode('utf-8'))
        ## ^----- 2015-11-4: moved this line back from c.440

        pronun = pronun.strip(' \n').split('\n')
        print pronun

        ## deal with this, but TODO: work out long-term solution --
        assert len(pronun) >= 2, str(
            pronun)  ## ==   -->   >=     to handle extra warnings
        if type(word) == str:
            word = word.decode('utf-8')
        normalised_word = unicodedata.normalize('NFKD', word)
        real_pronuns = []
        for line in pronun:
            if 'stack usage' not in line and normalised_word in line:  ## word in line   added to reject warnings
                real_pronuns.append(line)

        word = unicodedata.normalize('NFKD', word)
        clean_pronuns = []
        for line in real_pronuns:
            (outword, number, score, pronun) = re.split('\s+',
                                                        line,
                                                        maxsplit=3)
            outword = unicodedata.normalize('NFKD', outword.decode('utf-8'))
            if type(word) == str:
                word = word.decode('utf-8')
            assert outword == word, 'don\'t match: %s and %s' % (outword, word)
            ## sequitur seems to return decomposed forms

            clean_pronuns.append(pronun)
        clean_pronuns = ' sil '.join(clean_pronuns)
        return clean_pronuns

    def get_oov_pronunciation(self, word):
        '''Currently use system call -- TODO: keep this all in python?'''

        word = word.lower()

        word = self.strip_space_and_punc(word)  ## strip apostrophe and also other punc that gets into KY's words (Poirot'',)
        #escaped_word = "'" + word.replace("'", "'\\''") + "'" ## escape ' for shell command
        #escaped_word = "'" + word.replace("'", "") + "'"  ## strip apostrophe
        escaped_word = "'" + word + "'"

        comm = '%s echo %s | %s  --model %s --encoding utf8 --apply -'%(self.g2p_path,  \
                                                    escaped_word, self.lts_tool, self.lts_fname)

        pronun = subprocess.check_output(comm.encode('utf8'),
                                         shell=True,
                                         stderr=subprocess.STDOUT)
        if 'failed to convert' in pronun:
            print comm
            print 'WARNING: couldn\'t run LTS for %s' % (word)
            return None

        ## remove the 'stack usage' output line -- its position varies:
        pronun = unicodedata.normalize('NFKD', pronun.decode('utf-8'))
        ## ^----- 2015-11-4: moved this line back from c.440

        pronun = pronun.strip(' \n').split('\n')

        ## form of returned pronunciations is different when warnings are given:
        ## ['stack usage:  415', 'androcles\ta1 n d r @0 k @0 lw z'] ... becomes:
        ## ['/afs/inf.ed.ac.uk/group/cstr/projects/blizzard_entries/blizzard2015/tool/Ossian//tools/bin/g2p.py:37: DeprecationWarning: the sets module is deprecated', '  import math, sets, sys', 'stack usage:  415', 'androcles\ta1 n d r @0 k @0 lw z']

        ## deal with this, but TODO: work out long-term solution --
        assert len(pronun) >= 2, str(
            pronun)  ## ==   -->   >=     to handle extra warnings
        if type(word) == str:
            word = word.decode('utf-8')
        normalised_word = unicodedata.normalize('NFKD', word)
        for line in pronun:
            if 'stack usage' not in line and normalised_word in line:  ## word in line   added to reject warnings
                pronun = line

        (outword, pronun) = re.split('\s+', pronun, maxsplit=1)
        outword = unicodedata.normalize('NFKD', outword)
        #        word = unicodedata.normalize('NFKD', word.decode('utf-8'))
        if type(word) == str:
            word = word.decode('utf-8')
        word = unicodedata.normalize('NFKD', word)

        assert outword == word, 'don\'t match: %s and %s' % (outword, word)
        ## sequitur seems to return decomposed forms
        return pronun
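
    ## E.g. for the word 'androcles' the pipeline built above looks like
    ## (paths illustrative):
    ##   export PYTHONPATH=...site-packages:...g2p ; echo 'androcles' | g2p.py --model lts.model --encoding utf8 --apply -
    ## and on success sequitur prints '<word>\t<phones>' lines such as the
    ## 'androcles\ta1 n d r @0 k @0 lw z' shown in the comment above.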

    def strip_space_and_punc(self, token):
        '''Use regex to match unicode properties to strip punctuation and space.'''
        space_or_punc = '[\p{Z}\p{C}\p{P}\p{S}]'  ## classes union inside [...]; no | needed
        return regex.sub(space_or_punc, '', token)
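
    ## Quick illustration (the \p{...} property classes require the
    ## third-party `regex` module; the stdlib `re` does not support them):
    ##   >>> regex.sub(u'[\p{Z}\p{C}\p{P}\p{S}]', u'', u"Poirot'',")
    ##   u'Poirot'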

    def read_festival_lexentry(self, string):

        ## TODO: handle simple pronunciations better

        string = re.sub('(\A\s*\(\s*|\s*\)\s*\Z)', '',
                        string)  ## strip outer brackets

        entry = re.split(
            '(\A"[^"]+\")',
            string)  ## brackets to return chunks as well as splits

        entry = [chunk for chunk in entry if chunk != '']  ## filter initial ''
        assert len(entry) == 2, entry
        word, pronun = entry

        word = word.strip('"')
        pronun = pronun.strip(' ')
        ## tag might be a string or bracketed sequence:

        if pronun[0] == '(':
            pronun = re.split('\)\s*\(', pronun,
                              maxsplit=1)  # re.split('(\([^)]+\))',pronun)
        else:
            pronun = re.split('\s+', pronun, maxsplit=1)
        pronun = [chunk for chunk in pronun if chunk != '']

        assert len(pronun) == 2, pronun
        tag, all_syllables = pronun

        tag = tag.strip('() ')

        pronun = []

        ## is it a simple pronunciation (a later addition: no syllabification,
        ## and no stress appended to the vowel)?
        if all_syllables.count(')') == 1 and all_syllables.count('(') == 1:
            phones = all_syllables.strip('()')
            stress = '1'  ## dummy (unused in this branch)
            pronun.append(phones)

        else:

            syllables = re.split('\)\s*\(', all_syllables)
            syllables = [syll.strip(' ()') for syll in syllables]

            for syll in syllables:
                phones, stress = re.split('\)\s*', syll)
                phones = phones.split(' ')
                stressed_phones = []
                for phone in phones:
                    if self.phoneset.lookup(phone,
                                            field='vowel_cons') == 'vowel':
                        phone = phone + stress
                    stressed_phones.append(phone)
                pronun.append(' '.join(stressed_phones))
        pronun = ' | '.join(pronun)

        ## parse tag into either [POS] or [POS, disambig] or [POS, disambig, variant_tag]
        if ' ' in tag:
            tag = tag.split(' ')
            assert len(tag) == 2  # in [2,3]
        else:
            tag = [tag]
        tag = ','.join(tag)  ## tuple(tag)

        return (word, tag, pronun)
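
## E.g. a festival/combilex-style line such as (illustrative entry, assuming
## '@' and 'a' are marked as vowels in the phoneset):
##   ("abandon" (vbp) (((@ b) 0) ((a n) 1) ((d @ n) 0)))
## parses to word='abandon', tag='vbp' and, with stress digits appended to
## the vowels, pronun='@0 b | a1 n | d @0 n'.


## Below is a minimal standalone sketch of the maximum-legal-onset rule used
## by Lexicon.syllabify, with a toy vowel set and onset inventory; the phone
## names, vowels and onsets are illustrative, not from a real phoneset:

def max_onset_syllabify(phones, vowels, legal_onsets):
    '''Insert '|' boundaries so that each syllable receives the longest
    onset attested in legal_onsets (maximum legal onset).'''
    plain = [p.rstrip('0123456789') for p in phones]  ## strip stress digits
    vowel_indexes = [i for (i, p) in enumerate(plain) if p in vowels]
    phones = list(phones)
    start = vowel_indexes[0] + 1 if vowel_indexes else 0
    for end in vowel_indexes[1:]:
        if start == end:  ## juncture between two vowels, as in 'buyer'
            best_split = start
        else:
            ## Score each split point by the length of the onset it gives the
            ## right-hand syllable; unattested onsets score -1:
            scores = []
            for split in range(start, end):
                onset = tuple(plain[split:end])
                scores.append((len(onset) if onset in legal_onsets else -1,
                               split))
            best_split = max(scores)[1]
        phones[best_split] = '| ' + phones[best_split]
        start = end + 1
    return ' '.join(phones)


if __name__ == '__main__':
    ## With ('z',) attested as an onset but ('g', 'z') not, 'e g z a1 m'
    ## splits as in the syllabify docstring:
    print max_onset_syllabify('e g z a1 m'.split(' '), set(['e', 'a']),
                              set([('z',)]))  ## -> e g | z a1 m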