def load_stanford_pcfg_parse(utt, parse, comma_is_pause=False):
    """Attach Stanford PCFG parse information to the words of an utterance.

    The parse is loaded into a ``parsetrees.stanfordPcfgTree`` and its leaves
    are matched, in order, with the non-pause words of ``utt``. Each leaf
    label is split on "-": the first field becomes the word's ``pos`` and the
    second its ``id``. The leaf's parent, grandparent and great-grandparent
    become the word's ``parent_phrase``, ``grandparent_phrase`` and
    ``greatgrandparent_phrase``; missing ancestors are replaced by
    ``parsetrees.get_fake_stanford_pcfg_parse()``. Words whose id is one of
    the utterance's silence phonemes get fake parses on all three levels and
    the pos ``"sil"``.

    Args:
        utt: Utterance whose ``words`` have already been loaded. Its words
            are modified in place.
        parse: Parse handed unchanged to ``stanfordPcfgTree.make_tree``.
        comma_is_pause: When True, "," leaves are included among the tree
            leaves; the flag is also forwarded to the utterance's word
            counting and word retrieval.

    Raises:
        SiReError: If ``utt.words`` is None, or if the number of leaves still
            differs from the number of words after trying to split words.
    """
    if utt.words is None:
        raise SiReError(
            "No words in utterance! Please load an mlf or txt file first!")

    tree = parsetrees.stanfordPcfgTree()
    tree.make_tree(parse)
    # Explicit comparison kept on purpose: only True (or a value equal to it)
    # switches comma leaves on, exactly as before.
    if comma_is_pause == True:  # noqa: E712
        leafs = tree.get_leafs(include_punct=[","])
    else:
        leafs = tree.get_leafs()

    num_w = utt.num_words_no_pau(comma_is_pause)
    if len(leafs) != num_w:
        # First we try to see if this is due to differences in how words are
        # dealt with in parsing and annotation.
        # Prime example is using 's in e.g. there's for transcription instead
        # of there is. Parsing splits there's into two whereas in e.g.
        # combilex there's is one word.
        # If this is the case we split the WORD into two with the 's being a
        # single phoneme single syllable word. In other cases the contraction
        # straddles two words and we add a "phony" word which affects
        # contexts but adds no phonemes.
        utterance_utils.try_split_words(utt)
        # Update num_w
        num_w = utt.num_words_no_pau(comma_is_pause)
        if len(leafs) != num_w:
            # Dump the word ids to help diagnose the mismatch. The function
            # call form of print works on both Python 2 and Python 3.
            for w in utt.words:
                print(w.id)
            raise SiReError(
                "Number of leaves ({0}) not equal to number of words ({1})! In utt ({2})!"
                .format(len(leafs), num_w, utt.id))

    # Match each word with parse
    words = utt.get_words_no_pau(comma_is_pause)
    for i, word in enumerate(words):
        l = leafs[i].label.split("-")
        word.id = l[1]
        word.pos = l[0]
        # There should always be a parent
        word.parent_phrase = leafs[i].parent
        # But there might not be more than one
        if word.parent_phrase.parent is not None:
            word.grandparent_phrase = word.parent_phrase.parent
        else:
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        # And certainly we might be done here
        if (word.grandparent_phrase.parent in [None, "xx"]
                or word.grandparent_phrase.parent.label == "xx"):
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        else:
            word.greatgrandparent_phrase = word.grandparent_phrase.parent

    # Now add fake parse for sil, pau and #
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes():
            word.parent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.pos = "sil"
def load_stanford_pcfg_parse(utt, parse, comma_is_pause=False):
    """Attach Stanford PCFG parse information to the words of an utterance.

    The parse is loaded into a ``parsetrees.stanfordPcfgTree`` and its leaves
    are matched, in order, with the non-pause words of ``utt``. Each leaf
    label is split on "-": the first field becomes the word's ``pos`` and the
    second its ``id``. The leaf's parent, grandparent and great-grandparent
    become the word's ``parent_phrase``, ``grandparent_phrase`` and
    ``greatgrandparent_phrase``; missing ancestors are replaced by
    ``parsetrees.get_fake_stanford_pcfg_parse()``. Words whose id is one of
    the utterance's silence phonemes get fake parses on all three levels and
    the pos ``"sil"``.

    Args:
        utt: Utterance whose ``words`` have already been loaded. Its words
            are modified in place.
        parse: Parse handed unchanged to ``stanfordPcfgTree.make_tree``.
        comma_is_pause: When True, "," leaves are included among the tree
            leaves; the flag is also forwarded to the utterance's word
            counting and word retrieval.

    Raises:
        SiReError: If ``utt.words`` is None, or if the number of leaves still
            differs from the number of words after trying to split words.
    """
    if utt.words is None:
        raise SiReError("No words in utterance! Please load an mlf or txt file first!")

    tree = parsetrees.stanfordPcfgTree()
    tree.make_tree(parse)
    # Explicit comparison kept on purpose: only True (or a value equal to it)
    # switches comma leaves on, exactly as before.
    if comma_is_pause == True:  # noqa: E712
        leafs = tree.get_leafs(include_punct=[","])
    else:
        leafs = tree.get_leafs()

    num_w = utt.num_words_no_pau(comma_is_pause)
    if len(leafs) != num_w:
        # First we try to see if this is due to differences in how words are
        # dealt with in parsing and annotation.
        # Prime example is using 's in e.g. there's for transcription instead
        # of there is. Parsing splits there's into two whereas in e.g.
        # combilex there's is one word.
        # If this is the case we split the WORD into two with the 's being a
        # single phoneme single syllable word. In other cases the contraction
        # straddles two words and we add a "phony" word which affects
        # contexts but adds no phonemes.
        utterance_utils.try_split_words(utt)
        # Update num_w
        num_w = utt.num_words_no_pau(comma_is_pause)
        if len(leafs) != num_w:
            # Dump the word ids to help diagnose the mismatch. The function
            # call form of print works on both Python 2 and Python 3.
            for w in utt.words:
                print(w.id)
            raise SiReError("Number of leaves ({0}) not equal to number of words ({1})! In utt ({2})!".format(len(leafs), num_w, utt.id))

    # Match each word with parse
    words = utt.get_words_no_pau(comma_is_pause)
    for i, word in enumerate(words):
        l = leafs[i].label.split("-")
        word.id = l[1]
        word.pos = l[0]
        # There should always be a parent
        word.parent_phrase = leafs[i].parent
        # But there might not be more than one
        if word.parent_phrase.parent is not None:
            word.grandparent_phrase = word.parent_phrase.parent
        else:
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        # And certainly we might be done here
        if word.grandparent_phrase.parent in [None, "xx"] or word.grandparent_phrase.parent.label == "xx":
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        else:
            word.greatgrandparent_phrase = word.grandparent_phrase.parent

    # Now add fake parse for sil, pau and #
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes():
            word.parent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.pos = "sil"
def load_stanford_dependency_parse(utt, parse):
    """Attach Stanford dependency parse information to the words of an utterance.

    The parse is loaded into a ``parsetrees.stanfordDependencyTree`` and its
    nodes (fetched with ``utt_sorted=True``) are matched, in order, with the
    non-pause words of ``utt``. Each matched word takes its ``id`` from the
    node label, the node records the word's position in the utterance as
    ``utt_pos``, and the word gets the node itself as ``parent_dependency``,
    the node's parent as ``grandparent_dependency`` and that parent's parent
    as ``greatgrandparent_dependency`` (an empty ``stanfordDependencyTree``
    when there is none). Words whose id is a silence phoneme or "," get
    empty trees on all three levels.

    Args:
        utt: Utterance whose ``words`` have already been loaded. Its words
            are modified in place.
        parse: Parse handed unchanged to ``stanfordDependencyTree.make_tree``.

    Raises:
        SiReError: If ``utt.words`` is None, or if the number of nodes still
            differs from the number of words after trying to split words.
    """
    if utt.words is None:
        raise SiReError(
            "No words in utterance! Please load an mlf or txt file first!")

    tree = parsetrees.stanfordDependencyTree()
    tree.make_tree(parse)
    # As each word is at a node not at a leaf we get the nodes.
    nodes = tree.get_nodes(utt_sorted=True)
    if len(nodes) != utt.num_words_no_pau():
        # First we try to see if this is due to differences in how words are
        # dealt with in parsing and annotation.
        # Prime example is using 's in e.g. there's for transcription instead
        # of there is. Parsing splits there's into two whereas in e.g.
        # combilex there's is one word.
        # If this is the case we split the WORD into two with the 's being a
        # single phoneme single syllable word. In other cases the contraction
        # straddles two words and we add a "phony" word which affects
        # contexts but adds no phonemes.
        utterance_utils.try_split_words(utt)
        if len(nodes) != utt.num_words_no_pau():
            # Dump the node labels to help diagnose the mismatch. The
            # function call form of print works on both Python 2 and 3.
            for node in nodes:
                print(node.label)
            raise SiReError(
                "Number of nodes ({0}) not equal to number of words ({1})! In utt ({2})!"
                .format(len(nodes), utt.num_words_no_pau(), utt.id))

    # Match each word with parse
    for i, word in enumerate(utt.get_words_no_pau()):
        # As we may have split words the parse contains the id
        word.id = nodes[i].label
        # But as we may have punctuation the word itself contains the utt_pos
        nodes[i].utt_pos = word.pos_in_utt()
        # There should always be itself
        word.parent_dependency = nodes[i]
        # And there should always be a parent
        word.grandparent_dependency = word.parent_dependency.parent
        # But there might not be more than one
        if word.grandparent_dependency.parent is not None:
            word.greatgrandparent_dependency = word.grandparent_dependency.parent
        else:
            word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()

    # Now add empty parse for sil, pau and #
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes() + [","]:
            word.parent_dependency = parsetrees.stanfordDependencyTree()
            word.grandparent_dependency = parsetrees.stanfordDependencyTree()
            word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()
def load_stanford_dependency_parse(utt, parse):
    """Attach Stanford dependency parse information to the words of an utterance.

    The parse is loaded into a ``parsetrees.stanfordDependencyTree`` and its
    nodes (fetched with ``utt_sorted=True``) are matched, in order, with the
    non-pause words of ``utt``. Each matched word takes its ``id`` from the
    node label, the node records the word's position in the utterance as
    ``utt_pos``, and the word gets the node itself as ``parent_dependency``,
    the node's parent as ``grandparent_dependency`` and that parent's parent
    as ``greatgrandparent_dependency`` (an empty ``stanfordDependencyTree``
    when there is none). Words whose id is a silence phoneme or "," get
    empty trees on all three levels.

    Args:
        utt: Utterance whose ``words`` have already been loaded. Its words
            are modified in place.
        parse: Parse handed unchanged to ``stanfordDependencyTree.make_tree``.

    Raises:
        SiReError: If ``utt.words`` is None, or if the number of nodes still
            differs from the number of words after trying to split words.
    """
    if utt.words is None:
        raise SiReError("No words in utterance! Please load an mlf or txt file first!")

    tree = parsetrees.stanfordDependencyTree()
    tree.make_tree(parse)
    # As each word is at a node not at a leaf we get the nodes.
    nodes = tree.get_nodes(utt_sorted=True)
    if len(nodes) != utt.num_words_no_pau():
        # First we try to see if this is due to differences in how words are
        # dealt with in parsing and annotation.
        # Prime example is using 's in e.g. there's for transcription instead
        # of there is. Parsing splits there's into two whereas in e.g.
        # combilex there's is one word.
        # If this is the case we split the WORD into two with the 's being a
        # single phoneme single syllable word. In other cases the contraction
        # straddles two words and we add a "phony" word which affects
        # contexts but adds no phonemes.
        utterance_utils.try_split_words(utt)
        if len(nodes) != utt.num_words_no_pau():
            # Dump the node labels to help diagnose the mismatch. The
            # function call form of print works on both Python 2 and 3.
            for node in nodes:
                print(node.label)
            raise SiReError("Number of nodes ({0}) not equal to number of words ({1})! In utt ({2})!".format(len(nodes), utt.num_words_no_pau(), utt.id))

    # Match each word with parse
    for i, word in enumerate(utt.get_words_no_pau()):
        # As we may have split words the parse contains the id
        word.id = nodes[i].label
        # But as we may have punctuation the word itself contains the utt_pos
        nodes[i].utt_pos = word.pos_in_utt()
        # There should always be itself
        word.parent_dependency = nodes[i]
        # And there should always be a parent
        word.grandparent_dependency = word.parent_dependency.parent
        # But there might not be more than one
        if word.grandparent_dependency.parent is not None:
            word.greatgrandparent_dependency = word.grandparent_dependency.parent
        else:
            word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()

    # Now add empty parse for sil, pau and #
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes()+[","]:
            word.parent_dependency = parsetrees.stanfordDependencyTree()
            word.grandparent_dependency = parsetrees.stanfordDependencyTree()
            word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()