from treeBank import TreeBank

# Load the training treebank and ask it for its word_freq mapping.
tb_train = TreeBank.from_file("./data/train.trees")
word_freq = tb_train.count_word_freq()

# Partition the entries of word_freq by their second field:
#   vocab_train -> first fields whose second field is greater than 1
#   rare_train  -> first fields whose second field is exactly 1
# (presumably word -> occurrence count; confirm against count_word_freq).
# Alternative: tb_train.collect_vocabulary() would keep every training word.
vocab_train = {entry[0] for entry in word_freq.items() if entry[1] > 1}
rare_train = {entry[0] for entry in word_freq.items() if entry[1] == 1}

# NOTE(review): the variable is named "dev" but the file read is test.trees;
# confirm which split is intended.
tb_dev = TreeBank.from_file("./data/test.trees")
vocab_dev = tb_dev.collect_vocabulary()

# Everything in the second vocabulary that is not in vocab_train, one per line.
diff = vocab_dev.difference(vocab_train)
for w in diff:
    print(w)

# Entries that fall in rare_train and also appear in the second vocabulary.
print(rare_train.intersection(vocab_dev))
def train_on(filename):
    """Load a treebank from *filename*, normalize it, and return its rule set.

    The rule set is collected with ``vertical_markov=True`` and
    ``vertical_markov_factor=0.8``; the returned object is whatever
    ``TreeBank.collect_rule_set`` produces.
    """
    treebank = TreeBank.from_file(filename)
    # normalize() is called for its effect on the treebank before the rules
    # are collected; its return value is not used.
    treebank.normalize()
    return treebank.collect_rule_set(
        vertical_markov=True,
        vertical_markov_factor=0.8,
    )
#!/usr/bin/env python
from treeBank import TreeBank

# Collect the vocabulary of the train and dev treebanks.
train_words = TreeBank.from_file('./data/train.trees').collect_vocabulary()
dev_words = TreeBank.from_file('./data/dev.trees').collect_vocabulary()

# Entries of the dev vocabulary that are absent from the train vocabulary.
diff = dev_words.difference(train_words)

# Report the entries themselves, then how many there are.
print(diff)
print(len(diff))
from treeBank import TreeBank

tb_train = TreeBank.from_file("./data/train.trees")
word_freq = tb_train.count_word_freq()

# Split the word_freq entries in one pass: second field > 1 goes to
# vocab_train, second field == 1 goes to rare_train, anything else is dropped.
# (Presumably entries are (word, count) pairs; confirm with count_word_freq.)
# Alternative: tb_train.collect_vocabulary() would keep every training word.
vocab_train = set()
rare_train = set()
for entry in word_freq.items():
    if entry[1] > 1:
        vocab_train.add(entry[0])
    elif entry[1] == 1:
        rare_train.add(entry[0])

# NOTE(review): named "dev" but reads test.trees; confirm the intended split.
tb_dev = TreeBank.from_file("./data/test.trees")
vocab_dev = tb_dev.collect_vocabulary()

# Print each entry of the second vocabulary that is missing from vocab_train.
diff = vocab_dev.difference(vocab_train)
for w in diff:
    print(w)

# Print the rare_train entries that also occur in the second vocabulary.
print(rare_train.intersection(vocab_dev))
#!/usr/bin/env python
from treeBank import TreeBank

# Number of children of every node yielded by bottom_up(), across all trees
# in the training treebank.
child_counts = [
    len(node.children)
    for t in TreeBank.from_file('./data/train.trees').trees
    for node in t.bottom_up()
]

# Largest child count seen; max() raises ValueError if there were no nodes.
ans = max(child_counts)
print(ans)
#!/usr/bin/env python
from treeBank import TreeBank

treebank = TreeBank.from_file('./data/train.trees')

# Take the maximum len(node.children) over every node that bottom_up() yields
# for every tree; max() raises ValueError when nothing is yielded.
ans = max(
    len(node.children)
    for t in treebank.trees
    for node in t.bottom_up()
)
print(ans)
#!/usr/bin/env python
from treeBank import TreeBank

# Load the training treebank and call normalize() on it before collecting rules.
tb = TreeBank.from_file("./data/train.trees")
tb.normalize()

# Collect the rule set with default arguments and print its `size` attribute.
rule_set = tb.collect_rule_set()
print(rule_set.size)