def parse_mtree(self):
    """Parse the text with MetricalTree and store stress features on its word tokens.

    Steps, in order:
      1. Tokens from ``self.wordtokens()`` are split at pauses and then into
         sentences by the ``metricaltree`` helpers, parsed, and turned into
         one stats mapping per word token.
      2. Every stats entry whose key is not in ``mtree.INFO_DO_NOT_STORE`` is
         copied into the token's ``feats`` dict (created if missing).
      3. Word tokens are grouped into sentences by the ``'sidx'`` value of
         their stats entry; one ``Sentence(tokens, tree)`` per group is added
         to ``self._sentences``.
      4. For each line in ``self.lines()``, two per-line features are written
         on every child token: ``'norm_mean_line'`` (min-max rescaling of
         ``'norm_mean'`` within the line) and ``'mean_line'`` (``'mean'``
         shifted so that the line's minimum becomes 1.0).

    ``'norm_mean_line'`` is NaN when the line has no non-NaN ``'norm_mean'``
    value, when the line maximum is 0, or when all valid values are equal
    (the rescaling would divide by zero). ``'mean_line'`` is NaN when the
    line has no non-NaN ``'mean'`` value.

    Raises:
        Exception: if ``self.lang`` is not ``'en'``.
        AssertionError: if the parser returns a different number of stats
            entries than word tokens, or of trees than grouped sentences.
    """
    if self.lang != 'en':
        raise Exception(
            "MetricalTree parsing only works currently for English text.")

    import metricaltree as mtree
    mtree.set_paths(self.dir_mtree)

    wordtoks = self.wordtokens()
    toks = [wtok.token for wtok in wordtoks]

    # Split at pauses first, then split each pause group into sentences.
    sents = []
    for pause in mtree.pause_splitter_tokens(toks):
        sents.extend(mtree.split_sentences_from_tokens(pause))

    parser = mtree.return_parser(self.dir_mtree)
    trees = list(parser.lex_parse_sents(sents, verbose=False))
    stats = parser.get_stats(trees, arto=True, format_pandas=False)
    assert len(stats) == len(wordtoks), \
        "parser stats and word tokens differ in length"

    # Regroup the word-token objects into sentences: a change in 'sidx'
    # between consecutive stats entries starts a new sentence.
    sents = []
    sent = []
    sent_id = None
    for wTok, wStat in zip(wordtoks, stats):
        if sent_id != wStat['sidx']:
            sent_id = wStat['sidx']
            if sent:
                sents.append(sent)
            sent = []
        sent.append(wTok)

        if not hasattr(wTok, 'feats'):
            wTok.feats = {}
        for k, v in wStat.items():
            if k in mtree.INFO_DO_NOT_STORE:
                continue
            wTok.feats[k] = v
    if sent:
        sents.append(sent)

    assert len(sents) == len(trees), \
        "grouped sentences and parse trees differ in length"

    from Sentence import Sentence
    for sent, tree in zip(sents, trees):
        sentobj = Sentence(sent, tree)
        self._sentences += [sentobj]

    # create a normalized stress per line
    import numpy as np
    for line in self.lines():
        wtoks = line.children

        # norm mean: min-max rescale within the line, ignoring NaN values
        stresses = [
            wtok.feats['norm_mean'] for wtok in wtoks
            if not np.isnan(wtok.feats['norm_mean'])
        ]
        if stresses:
            max_stress = float(max(stresses))
            min_stress = float(min(stresses))
        else:
            # No usable value on this line: fall through to the NaN branch
            # instead of calling max()/min() on an empty list.
            max_stress = min_stress = 0.0
        spread = max_stress - min_stress
        # `max_stress` being zero is the original guard; `spread` being zero
        # is the actual divisor and must be checked as well.
        can_rescale = bool(max_stress) and spread != 0
        for wtok in wtoks:
            if can_rescale:
                wtok.feats['norm_mean_line'] = (
                    wtok.feats['norm_mean'] - min_stress) / spread
            else:
                wtok.feats['norm_mean_line'] = np.nan

        # mean: shift so that the smallest valid value on the line becomes 1.0
        stresses = [
            wtok.feats['mean'] for wtok in wtoks
            if not np.isnan(wtok.feats['mean'])
        ]
        if stresses:
            diff = 1.0 - float(min(stresses))
        else:
            # Every 'mean' on the line is NaN (or the line is empty).
            diff = np.nan
        for wtok in wtoks:
            wtok.feats['mean_line'] = wtok.feats['mean'] + diff
def parse_mtree(self):
    """Run the MetricalTree parser over this text and annotate its word tokens.

    The word tokens returned by ``self.wordtokens()`` are split into
    sentences (at pauses, then at sentence boundaries) and parsed. The parser
    yields one stats mapping per word token; every entry whose key is not
    listed in ``mtree.INFO_DO_NOT_STORE`` is stored in that token's ``feats``
    dict, which is created when the token has none.

    Consecutive tokens sharing the same ``'sidx'`` stats value form one
    sentence; each such group is paired with its parse tree in a ``Sentence``
    object that is added to ``self._sentences``.

    Finally, each line from ``self.lines()`` gets two line-relative features
    on all of its child tokens:

    * ``'norm_mean_line'``: ``(norm_mean - line_min) / (line_max - line_min)``,
      or NaN if the line has no non-NaN ``'norm_mean'``, if ``line_max`` is 0,
      or if ``line_max == line_min`` (which would be a division by zero).
    * ``'mean_line'``: ``mean + (1.0 - line_min_mean)``, or NaN if the line
      has no non-NaN ``'mean'``.

    Raises:
        Exception: when ``self.lang`` is anything other than ``'en'``.
        AssertionError: when the number of stats entries does not match the
            number of word tokens, or the number of grouped sentences does
            not match the number of parse trees.
    """
    if self.lang != 'en':
        raise Exception("MetricalTree parsing only works currently for English text.")

    import metricaltree as mtree
    mtree.set_paths(self.dir_mtree)

    wordtoks = self.wordtokens()
    toks = [wtok.token for wtok in wordtoks]
    pauses = mtree.pause_splitter_tokens(toks)

    # Each pause group may itself contain several sentences.
    token_sents = []
    for pause in pauses:
        token_sents.extend(mtree.split_sentences_from_tokens(pause))

    parser = mtree.return_parser(self.dir_mtree)
    trees = list(parser.lex_parse_sents(token_sents, verbose=False))
    stats = parser.get_stats(trees, arto=True, format_pandas=False)
    assert len(stats) == len(wordtoks), \
        "parser stats and word tokens differ in length"

    # Rebuild sentences out of the word-token objects, starting a new
    # sentence whenever the 'sidx' of the stats entry changes.
    sents = []
    current = []
    sent_id = None
    for wTok, wStat in zip(wordtoks, stats):
        if wStat['sidx'] != sent_id:
            sent_id = wStat['sidx']
            if current:
                sents.append(current)
            current = []
        current.append(wTok)

        if not hasattr(wTok, 'feats'):
            wTok.feats = {}
        for key, value in wStat.items():
            if key not in mtree.INFO_DO_NOT_STORE:
                wTok.feats[key] = value
    if current:
        sents.append(current)

    assert len(sents) == len(trees), \
        "grouped sentences and parse trees differ in length"

    from Sentence import Sentence
    for sent, tree in zip(sents, trees):
        self._sentences += [Sentence(sent, tree)]

    # create a normalized stress per line
    import numpy as np
    for line in self.lines():
        wtoks = line.children

        # norm mean
        valid = [wtok.feats['norm_mean'] for wtok in wtoks
                 if not np.isnan(wtok.feats['norm_mean'])]
        # An empty `valid` would make max()/min() raise ValueError, so treat
        # it like the degenerate "nothing to rescale" case.
        max_stress = float(max(valid)) if valid else 0.0
        min_stress = float(min(valid)) if valid else 0.0
        stress_range = max_stress - min_stress
        # Keep the original `max_stress` guard and additionally refuse to
        # divide by a zero range (all valid values on the line are equal).
        degenerate = (not max_stress) or stress_range == 0
        for wtok in wtoks:
            if degenerate:
                wtok.feats['norm_mean_line'] = np.nan
            else:
                wtok.feats['norm_mean_line'] = (
                    wtok.feats['norm_mean'] - min_stress) / stress_range

        # mean
        valid = [wtok.feats['mean'] for wtok in wtoks
                 if not np.isnan(wtok.feats['mean'])]
        # With no valid value the offset is NaN, so every 'mean_line' on the
        # line is NaN rather than the call failing on min([]).
        diff = (1.0 - float(min(valid))) if valid else np.nan
        for wtok in wtoks:
            wtok.feats['mean_line'] = wtok.feats['mean'] + diff