def build_vocab(dataset):
    """Build the word and part-of-speech vocabularies from a dataset.

    ``dataset`` is walked two levels deep: each of its elements is iterated
    to obtain paragraphs, and every EDU yielded by ``paragraph.edus()`` adds
    its ``words`` and ``tags`` to the matching frequency counter.

    Args:
        dataset: Iterable of iterables of paragraph objects exposing ``edus()``.

    Returns:
        A ``(word_vocab, pos_vocab)`` tuple of ``Vocab`` objects built from
        the collected word and tag frequencies.
    """
    word_counts, tag_counts = Counter(), Counter()
    for group in dataset:
        for paragraph in group:
            for edu in paragraph.edus():
                word_counts.update(edu.words)
                tag_counts.update(edu.tags)
    return Vocab("word", word_counts), Vocab("part of speech", tag_counts)
def build_vocab(dataset):
    """Build the word, part-of-speech and GCN-tag vocabularies from a dataset.

    ``dataset`` is flattened one level to obtain paragraphs; every EDU yielded
    by ``paragraph.edus()`` adds its ``words`` and ``tags`` to the matching
    frequency counter. The GCN-tag vocabulary does not depend on the data: it
    is always built from the three tags "dep", "head" and "self", each
    counted once.

    Args:
        dataset: Iterable of iterables of paragraph objects exposing ``edus()``.

    Returns:
        A ``(word_vocab, pos_vocab, gcn_vocab)`` tuple of ``Vocab`` objects.
    """
    word_counts = Counter()
    tag_counts = Counter()
    all_edus = (
        edu
        for paragraph in chain.from_iterable(dataset)
        for edu in paragraph.edus()
    )
    for edu in all_edus:
        word_counts.update(edu.words)
        tag_counts.update(edu.tags)
    return (
        Vocab("word", word_counts),
        Vocab("part of speech", tag_counts),
        Vocab("gcn tag", Counter(["dep", "head", "self"])),
    )
def build_vocab(instances):
    """Build the word/POS vocabularies and the transition label set.

    Each instance is unpacked as a ``(words, poses, trans)`` triple. ``words``
    and ``poses`` are flattened one level before their elements are counted;
    ``trans`` is handed to ``Counter.update`` as is.

    Args:
        instances: Iterable of ``(words, poses, trans)`` triples.

    Returns:
        A ``(word_vocab, pos_vocab, trans_label)`` tuple: two ``Vocab``
        objects followed by a ``Label`` named "transition".
    """
    word_counts, pos_counts, transition_counts = Counter(), Counter(), Counter()
    for word_groups, pos_groups, transitions in instances:
        word_counts.update(chain.from_iterable(word_groups))
        pos_counts.update(chain.from_iterable(pos_groups))
        transition_counts.update(transitions)
    return (
        Vocab("word", word_counts),
        Vocab("part of speech", pos_counts),
        Label("transition", transition_counts),
    )
def build_vocab(trees, trans):
    """Build the word/POS vocabularies and the transition label set.

    Args:
        trees: Iterable of tree objects exposing ``edus()``; every EDU
            contributes the elements of its ``words`` and ``tags``.
        trans: Iterable of iterables of transitions; it is flattened one
            level and every element is counted.

    Returns:
        A ``(word_vocab, pos_vocab, trans_label)`` tuple: two ``Vocab``
        objects followed by a ``Label`` named "transition".
    """
    trans_label = Label("transition", Counter(chain.from_iterable(trans)))
    words_counter = Counter()
    poses_counter = Counter()
    for tree in trees:
        # Materialised because the EDUs are walked twice (words, then tags).
        edus = list(tree.edus())
        # Flatten through chain so each EDU's words/tags are always counted
        # element by element, without building intermediate lists.
        words_counter.update(chain.from_iterable(edu.words for edu in edus))
        poses_counter.update(chain.from_iterable(edu.tags for edu in edus))
    word_vocab = Vocab("word", words_counter)
    pos_vocab = Vocab("part of speech", poses_counter)
    return word_vocab, pos_vocab, trans_label
def build_vocab(dataset):
    """Build word/POS vocabularies and nuclear/relation label sets.

    ``dataset`` is flattened one level to obtain paragraphs. Each paragraph is
    searched with ``iterfind`` using a ``node_type_filter`` for ``EDU`` and
    ``Relation``. EDU nodes add their ``words`` and ``tags`` to the word and
    POS counters; Relation nodes add one count each for their ``nuclear`` and
    ``ftype`` values. The EDU check runs first, so a node matching it is never
    counted as a Relation.

    Args:
        dataset: Iterable of iterables of paragraph objects exposing
            ``iterfind(filter=...)``.

    Returns:
        A ``(word_vocab, pos_vocab, nuc_label, rel_label)`` tuple: two
        ``Vocab`` objects followed by two ``Label`` objects.
    """
    word_counts, tag_counts = Counter(), Counter()
    nuclear_counts, relation_counts = Counter(), Counter()
    for paragraph in chain.from_iterable(dataset):
        wanted = node_type_filter([EDU, Relation])
        for node in paragraph.iterfind(filter=wanted):
            if isinstance(node, EDU):
                word_counts.update(node.words)
                tag_counts.update(node.tags)
                continue
            if isinstance(node, Relation):
                nuclear_counts[node.nuclear] += 1
                relation_counts[node.ftype] += 1
    return (
        Vocab("word", word_counts),
        Vocab("part of speech", tag_counts),
        Label("nuclear", nuclear_counts),
        Label("relation", relation_counts),
    )