def extract_feature_utterances(filenames, feature, speaker=None, cutoff=0):
    """Collect one feature value per token for each utterance in the files.

    Args:
        filenames: Iterable of inputs passed one at a time to
            ``MorParser.parse``.
        feature: ``"pos"`` to map every token through ``rewriter``, or
            ``"word"`` to take every token's ``word`` attribute.
        speaker: When not ``None``, only utterances whose speaker field
            equals this value are kept; ``None`` keeps all speakers.
        cutoff: Minimum number of tokens an utterance needs to be kept.

    Returns:
        A list containing one inner list per retained utterance; each inner
        list holds the selected feature of that utterance's tokens, in the
        order the parser produced them.

    Raises:
        ValueError: If ``feature`` is neither ``"pos"`` nor ``"word"``.
    """
    # Validate before doing any work. Without this, an unknown feature left
    # the extractor unbound: the call blew up with a NameError only once a
    # matching utterance was reached, and silently returned [] otherwise.
    if feature == "pos":
        extract = rewriter
    elif feature == "word":
        def extract(token):
            return token.word
    else:
        raise ValueError(
            "unknown feature %r; expected 'pos' or 'word'" % (feature,)
        )

    parser = MorParser("{http://www.talkbank.org/ns/talkbank}")
    # from_iterable walks the files one after another instead of unpacking
    # every parse result into a single argument list first.
    corpus = itertools.chain.from_iterable(
        parser.parse(name) for name in filenames
    )

    # Each corpus item is a (speaker, tokens) pair.
    return [
        [extract(token) for token in tokens]
        for utt_speaker, tokens in corpus
        if (speaker is None or utt_speaker == speaker)
        and len(tokens) >= cutoff
    ]
# Example #2
# 0
def xml_to_tagfile(filename):
    """Lazily yield ``(speaker, rewritten_tokens)`` for each parsed utterance.

    ``filename`` is handed to ``MorParser.parse``; every token of each
    resulting utterance is passed through ``rewriter`` and the results are
    collected into a list. Nothing is parsed until the generator is iterated.
    """
    talkbank_parser = MorParser("{http://www.talkbank.org/ns/talkbank}")
    for who, words in talkbank_parser.parse(filename):
        rewritten = list(map(rewriter, words))
        yield who, rewritten