Exemplo n.º 1
0
    def parse_mtree(self):
        """Parse the text with MetricalTree and store stress features on words.

        The tokens of ``self.wordtokens()`` are passed through
        ``mtree.pause_splitter_tokens`` and
        ``mtree.split_sentences_from_tokens``, the resulting sentences are
        parsed, and the per-word statistics returned by the parser are copied
        into each word token's ``feats`` dict (keys listed in
        ``mtree.INFO_DO_NOT_STORE`` are skipped).  One ``Sentence`` object per
        parse tree is appended to ``self._sentences``.

        Afterwards every line in ``self.lines()`` gets two line-relative
        features on each of its children:

        * ``norm_mean_line``: ``norm_mean`` min-max rescaled within the line;
          NaN when the line has no non-NaN values or all of them are equal.
        * ``mean_line``: ``mean`` shifted so that the smallest non-NaN value
          of the line becomes 1.0; NaN when the line has no non-NaN values.

        Raises:
            Exception: if ``self.lang`` is not ``'en'``.
            AssertionError: if the number of word statistics does not match
                the number of word tokens, or the number of regrouped
                sentences does not match the number of parse trees.
        """
        if self.lang != 'en':
            raise Exception(
                "MetricalTree parsing only works currently for English text.")

        import metricaltree as mtree
        mtree.set_paths(self.dir_mtree)

        wordtoks = self.wordtokens()
        toks = [wtok.token for wtok in wordtoks]

        # Split the token stream at pauses first, then into sentences.
        sents = []
        for pause in mtree.pause_splitter_tokens(toks):
            sents.extend(mtree.split_sentences_from_tokens(pause))
        parser = mtree.return_parser(self.dir_mtree)
        trees = list(parser.lex_parse_sents(sents, verbose=False))
        stats = parser.get_stats(trees, arto=True, format_pandas=False)
        # The zip() below silently truncates on a mismatch, so check first.
        assert len(stats) == len(wordtoks), (
            "MetricalTree returned %d word stats for %d word tokens"
            % (len(stats), len(wordtoks)))

        # Regroup the word tokens into sentences using the sentence index
        # ('sidx') reported for each word, and copy the stats onto the token.
        sents = []
        sent = []
        sent_id = None
        for wTok, wStat in zip(wordtoks, stats):
            if sent_id != wStat['sidx']:
                sent_id = wStat['sidx']
                if sent:
                    sents.append(sent)
                sent = []

            sent.append(wTok)
            if not hasattr(wTok, 'feats'):
                wTok.feats = {}
            for k, v in wStat.items():
                if k in mtree.INFO_DO_NOT_STORE:
                    continue
                wTok.feats[k] = v

        if sent:
            sents.append(sent)
        assert len(sents) == len(trees), (
            "regrouped %d sentences but the parser produced %d trees"
            % (len(sents), len(trees)))

        from Sentence import Sentence
        for sent, tree in zip(sents, trees):
            self._sentences.append(Sentence(sent, tree))

        # create a normalized stress per line
        import numpy as np
        for line in self.lines():
            wtoks = line.children

            # norm mean: min-max rescale within the line
            stresses = [
                wtok.feats['norm_mean'] for wtok in wtoks
                if not np.isnan(wtok.feats['norm_mean'])
            ]
            if stresses:
                min_stress = float(min(stresses))
                spread = float(max(stresses)) - min_stress
            else:
                # No usable values on this line: nothing to rescale against.
                min_stress = spread = 0.0
            for wtok in wtoks:
                # Guard on the spread, which is the actual divisor; a line
                # whose values are all equal would otherwise divide by zero.
                wtok.feats['norm_mean_line'] = (
                    (wtok.feats['norm_mean'] - min_stress) / spread
                    if spread else np.nan)

            # mean: shift so that the line minimum becomes 1.0
            stresses = [
                wtok.feats['mean'] for wtok in wtoks
                if not np.isnan(wtok.feats['mean'])
            ]
            # Without any usable value the shift is undefined, so use NaN.
            diff = 1.0 - float(min(stresses)) if stresses else np.nan
            for wtok in wtoks:
                wtok.feats['mean_line'] = wtok.feats['mean'] + diff
Exemplo n.º 2
0
	def parse_mtree(self):
		"""Parse the text with MetricalTree and store stress features on words.

		The tokens of ``self.wordtokens()`` are passed through
		``mtree.pause_splitter_tokens`` and
		``mtree.split_sentences_from_tokens``, the resulting sentences are
		parsed, and the per-word statistics returned by the parser are copied
		into each word token's ``feats`` dict (keys listed in
		``mtree.INFO_DO_NOT_STORE`` are skipped).  One ``Sentence`` object per
		parse tree is appended to ``self._sentences``.

		Afterwards every line in ``self.lines()`` gets two line-relative
		features on each of its children:

		* ``norm_mean_line``: ``norm_mean`` min-max rescaled within the line;
		  NaN when the line has no non-NaN values or all of them are equal.
		* ``mean_line``: ``mean`` shifted so that the smallest non-NaN value
		  of the line becomes 1.0; NaN when the line has no non-NaN values.

		Raises:
			Exception: if ``self.lang`` is not ``'en'``.
			AssertionError: if the number of word statistics does not match
				the number of word tokens, or the number of regrouped
				sentences does not match the number of parse trees.
		"""
		if self.lang != 'en':
			raise Exception("MetricalTree parsing only works currently for English text.")

		import metricaltree as mtree
		mtree.set_paths(self.dir_mtree)

		wordtoks = self.wordtokens()
		toks = [wtok.token for wtok in wordtoks]

		# Split the token stream at pauses first, then into sentences.
		sents = []
		for pause in mtree.pause_splitter_tokens(toks):
			sents.extend(mtree.split_sentences_from_tokens(pause))
		parser = mtree.return_parser(self.dir_mtree)
		trees = list(parser.lex_parse_sents(sents, verbose=False))
		stats = parser.get_stats(trees, arto=True, format_pandas=False)
		# The zip() below silently truncates on a mismatch, so check first.
		assert len(stats) == len(wordtoks), (
			"MetricalTree returned %d word stats for %d word tokens"
			% (len(stats), len(wordtoks)))

		# Regroup the word tokens into sentences using the sentence index
		# ('sidx') reported for each word, and copy the stats onto the token.
		sents = []
		sent = []
		sent_id = None
		for wTok, wStat in zip(wordtoks, stats):
			if sent_id != wStat['sidx']:
				sent_id = wStat['sidx']
				if sent:
					sents.append(sent)
				sent = []

			sent.append(wTok)
			if not hasattr(wTok, 'feats'):
				wTok.feats = {}
			for k, v in wStat.items():
				if k in mtree.INFO_DO_NOT_STORE:
					continue
				wTok.feats[k] = v

		if sent:
			sents.append(sent)
		assert len(sents) == len(trees), (
			"regrouped %d sentences but the parser produced %d trees"
			% (len(sents), len(trees)))

		from Sentence import Sentence
		for sent, tree in zip(sents, trees):
			self._sentences.append(Sentence(sent, tree))

		# create a normalized stress per line
		import numpy as np
		for line in self.lines():
			wtoks = line.children

			# norm mean: min-max rescale within the line
			stresses = [
				wtok.feats['norm_mean'] for wtok in wtoks
				if not np.isnan(wtok.feats['norm_mean'])
			]
			if stresses:
				min_stress = float(min(stresses))
				spread = float(max(stresses)) - min_stress
			else:
				# No usable values on this line: nothing to rescale against.
				min_stress = spread = 0.0
			for wtok in wtoks:
				# Guard on the spread, which is the actual divisor; a line
				# whose values are all equal would otherwise divide by zero.
				wtok.feats['norm_mean_line'] = (
					(wtok.feats['norm_mean'] - min_stress) / spread
					if spread else np.nan)

			# mean: shift so that the line minimum becomes 1.0
			stresses = [
				wtok.feats['mean'] for wtok in wtoks
				if not np.isnan(wtok.feats['mean'])
			]
			# Without any usable value the shift is undefined, so use NaN.
			diff = 1.0 - float(min(stresses)) if stresses else np.nan
			for wtok in wtoks:
				wtok.feats['mean_line'] = wtok.feats['mean'] + diff