示例#1
0
class TreeStats:
    """
    Determine tree-based statistics, such as tree depths, production counts,
    etc.

    Every sentence of the corpus is parsed when the object is constructed.
    One dict of per-tree figures is kept in ``self.stats`` (parallel to
    ``self.trees``); ``get_stats`` aggregates those figures.
    """

    # Default jar locations.  The literals are parenthesised so that the
    # adjacent string pieces are concatenated into ONE path; without the
    # parentheses the second piece is a separate no-op statement and only
    # the directory prefix gets assigned.
    DEFAULT_JAR = (
        '/usr/local/Cellar/stanford-parser/'
        '3.6.0/libexec/stanford-parser.jar'
    )
    DEFAULT_MODELS_JAR = (
        '/usr/local/Cellar/stanford-parser/'
        '3.6.0/libexec/stanford-parser-3.6.0-models.jar'
    )

    def __init__(self, corpus, path_to_jar=None, path_to_models_jar=None):
        """
        We'll use the Stanford Parser to do the heavy lifting here.

        :param corpus: a list of tagged sentences, or a single tagged
            sentence.  If the first element is a tuple the corpus is taken
            to be one sentence and is wrapped in a list.
        :param path_to_jar: optional override for the parser jar; defaults
            to ``DEFAULT_JAR``.
        :param path_to_models_jar: optional override for the models jar;
            defaults to ``DEFAULT_MODELS_JAR``.
        """
        jar = self.DEFAULT_JAR if path_to_jar is None else path_to_jar
        model = (self.DEFAULT_MODELS_JAR if path_to_models_jar is None
                 else path_to_models_jar)

        # `corpus and ...` keeps an empty corpus from raising IndexError.
        is_single_sentence = bool(corpus) and isinstance(corpus[0], tuple)
        self.corpus = [corpus] if is_single_sentence else corpus
        self.parser = StanfordParser(path_to_jar=jar, path_to_models_jar=model)

        # Nothing to parse for an empty corpus, so do not call the parser.
        parsed_sents = (self.parser.tagged_parse_sents(self.corpus)
                        if self.corpus else [])
        # Each sentence yields an iterable of trees; flatten into one list.
        self.trees = [t for tree in parsed_sents for t in tree]

        self.stats = [
            {
                'depth': tree.height(),
                'noun_phrases': self._n_productions(tree, 'NP'),
                'prepositional_phrases': self._n_productions(tree, 'PP'),
                'sbars': self._n_productions(tree, 'SBAR'),
                'nonterminals': len(tree.productions()),
            }
            for tree in self.trees
        ]

    @staticmethod
    def _n_productions(parse_tree, production):
        """
        Returns the number of subtrees of `parse_tree` whose label equals
        `production`.
        """
        return sum(1 for _ in parse_tree.subtrees(
            filter=lambda t: t.label() == production))

    def get_stats(self):
        """
        Combines all the statistics together.

        :returns: dict with the maximum tree depth and the per-tree averages
            of depth, noun phrases, prepositional phrases, SBARs and
            nonterminals.
        :raises ValueError: if there are no per-tree statistics to combine.
        """
        if not self.stats:
            raise ValueError('no parse trees available to compute statistics')

        # float() keeps the averages from truncating under Python 2.
        n = float(len(self.stats))

        def mean(key):
            return sum(stat[key] for stat in self.stats) / n

        return {
            'max_tree_depth': max(stat['depth'] for stat in self.stats),
            'avg_tree_depth': mean('depth'),
            'avg_noun_phrases': mean('noun_phrases'),
            'avg_prepositional_phrases': mean('prepositional_phrases'),
            'avg_sbars': mean('sbars'),
            'avg_nonterminals': mean('nonterminals'),
        }
示例#2
0
# Number of answers processed; named once so both passes below stay in sync.
n_answers = 70159

# Tally "(" and ")" tokens per answer into stat[j]['l'] / stat[j]['r'].
# stat[j] is only touched when a parenthesis token is actually seen.
for j in range(n_answers):
    for token, _tag in answers[j]:
        if token == "(":
            stat[j]['l'] = stat[j].get('l', 0) + 1
        elif token == ")":
            stat[j]['r'] = stat[j].get('r', 0) + 1

# Keys of stat whose opening and closing parenthesis counts differ.
nm = [idx for idx, counts in stat.items()
      if counts.get('l', 0) != counts.get('r', 0)]

# Set copy of nm: membership is tested once per answer, and a list scan
# there would make the filter quadratic.
unbalanced = set(nm)
pure_answers = [answers[i] for i in range(n_answers) if i not in unbalanced]

sample = random.sample(pure_answers, 50)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(sample)
trees = parser.tagged_parse_sents(pure_answers[0:100])


for tree in trees:
    # Each element of `trees` is itself an iterator; take its first item.
    t = next(tree)
    # t.draw()
    print(t.leaves())