def token_freq(trees):
    """
    Collect token frequency statistics from trees.

    Each tree is flattened with ``get_leaves_with_labels`` and only the token
    of every ``(token, label)`` pair is counted; the labels are ignored.

    Param:
    ------
    trees: iterable of tree

    Return:
    ------
    Counter, mapping each leaf token to the number of times it occurs
    across all the given trees (empty Counter when ``trees`` is empty)

    >>> trees = [(1, (1, (2, 'a'), (3, 'b')), (1, (1, 'c'), (2, 'd'))),
    ...          (1, (1, (2, 'b'), (3, 'b')), (1, (1, 'c'), (2, 'a')))]
    >>> token_freq(trees)
    Counter({'b': 3, 'a': 2, 'c': 2, 'd': 1})
    """
    counter = Counter()
    for tree in trees:
        # Feed the tokens straight into the running counter: this avoids a
        # temporary list and a temporary Counter per tree, and skips the
        # non-positive-count filtering that in-place ``+=`` performs.
        counter.update(
            token for token, _label in get_leaves_with_labels(tree)
        )
    return counter
def collect_nodes(trees):
    """
    Gather one record per distinct node of the given trees, working level by
    level from the leaves up to the roots.

    On every pass the current leaves of each tree are read, the tokens not
    recorded so far are stored, and the tree is replaced by the result of
    ``merge_leaves``. A tree leaves the work list when ``merge_leaves``
    raises ``CannotMergeAnyMoreException`` or when a pass finds no unseen
    token among its leaves (in that case it is not merged any further).

    Within one tree's batch the records follow set iteration order, so their
    relative order is not guaranteed; the literal orderings in the examples
    below depend on it. Tokens are only filtered against earlier batches, so
    a token carrying two different labels inside one batch is recorded once
    per label.

    Param:
    ------
    trees: list of tree
        (presumably the nested tuples produced by ``ptb.parse`` -- confirm)

    Return:
    ------
    list of tuple, (token, left child token, right child token, label);
    both child entries are None when the token is not a tuple

    >>> from ptb import parse
    >>> t1 = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
    >>> t2 = parse("(0 (0 (2 A) (0 (0 (0 boring) (2 ,)) (0 bad))) (1 (2 ,) (1 (1 (1 unsatisfactory) (2 film)) (2 .))))")
    >>> t3 = parse("(2 film)") # some repetition
    >>> data = collect_nodes([t1, t2, t3])
    >>> len(data)
    24
    >>> data[-1]
    ((('A', (('boring', ','), 'bad')), (',', (('unsatisfactory', 'film'), '.'))), ('A', (('boring', ','), 'bad')), (',', (('unsatisfactory', 'film'), '.')), 0)
    >>> data[0]
    ('funny', None, None, 3)
    >>> nodes = collect_nodes([t1])
    >>> len(nodes)
    14
    >>> nodes
    [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
    """
    records = []    # the result: one tuple per recorded node
    seen = set()    # tokens that already have a record
    pending = trees
    while len(pending) > 0:
        next_round = []  # merged (shallower) trees for the following pass
        for tree in pending:
            # Distinct (token, label) leaf pairs of this tree, restricted to
            # the tokens no earlier batch has recorded.
            fresh = [
                (token, label)
                for token, label in set(ptb.get_leaves_with_labels(tree))
                if token not in seen
            ]
            if not fresh:
                # Nothing new here: the tree is dropped without merging.
                continue

            for token, label in fresh:
                if isinstance(token, tuple):
                    # A merged node: its token is the pair of child tokens.
                    left, right = token[0], token[1]
                else:
                    # A single word has no children.
                    left = right = None
                records.append((token, left, right, label))
            seen.update(token for token, _ in fresh)

            try:
                merged = merge_leaves(tree)
            except CannotMergeAnyMoreException:
                # Fully collapsed tree: it takes no part in later passes.
                continue
            next_round.append(merged)

        pending = next_round  # move on to the shallower trees
    return records