def load_from_file(fname, vocab=None, alphabet=None, tags=None):
    """Load dataset from the given file."""
    reader = conllu.reader(fname)

    learn_tags, learn_vocab, tagset, vocab, alphabet = \
        TaggingDataset.initialize_vocab_and_tags(tags, vocab, alphabet)

    seqs = []
    for sentence in reader:
        # Per-sentence id sequences; note that `tags` below shadows the
        # parameter, whose value has already been consumed into `tagset`.
        words = []
        tags = []
        chars = []
        lemma_chars = []
        for word in sentence:
            word_id, char_ids, tag_id, lemma_char_ids = \
                TaggingDataset.get_word_and_tag_id(
                    word, vocab, alphabet, tagset, learn_vocab, learn_tags)

            words.append(word_id)
            chars.append(char_ids)
            tags.append(tag_id)
            lemma_chars.append(lemma_char_ids)

        seqs.append((words, chars, tags, lemma_chars))

    return TaggingDataset(seqs, vocab, alphabet, tagset)
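# Usage sketch (illustrative, not part of the original file): build the
# vocabulary, alphabet and tagset on the training data, then pass them in so
# the dev data is encoded with the same ids. The file names and the
# `train.vocab`/`train.alphabet`/`train.tags` attribute names are assumptions
# about how TaggingDataset stores its constructor arguments.
#
#   train = load_from_file("train.conllu")
#   dev = load_from_file("dev.conllu", vocab=train.vocab,
#                        alphabet=train.alphabet, tags=train.tags)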
def evaluate_tagger_and_writeout(tagger):
    """Tag CoNLL-U data from standard input and write it to standard output."""
    stdin = conllu.reader()
    stdout = conllu.writer()
    for sentence in stdin:
        # Map each word form to its vocabulary id, falling back to the OOV id.
        x = []
        for word in sentence:
            x.append(tagger.vocab.get(TaggingDataset.word_obj_to_str(word),
                                      tagger.vocab['#OOV']))

        x = np.array([x], dtype='int32')

        # Predict tag ids for the single-sentence batch and convert them back
        # to tag strings.
        y_hat = tagger.predict(x)[0]
        y_hat_str = [tagger.tags.rev(tag_id) for tag_id in y_hat]

        for word, utag in zip(sentence, y_hat_str):
            word.upos = utag

        stdout.write_sentence(sentence)
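# Illustration (not part of the original file): evaluate_tagger_and_writeout
# only relies on an object exposing `vocab`, `tags` and `predict()`, so a
# hypothetical stub like the one below is enough for a smoke test. It assumes
# `np` is numpy, that `vocab` maps word strings to ids (including '#OOV'),
# and that `tags.rev()` maps tag ids back to strings, as used above.
class ConstantTagger(object):
    """Stand-in tagger that predicts tag id 0 for every token."""

    def __init__(self, vocab, tags):
        self.vocab = vocab
        self.tags = tags

    def predict(self, x):
        # One tag id per token, mirroring the (batch, sentence_length) input.
        return np.zeros_like(x)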
def load_from_file(fname, vocab=None, tags=None):
    """Load dataset from the given file."""
    reader = conllu.reader(fname)

    learn_tags, learn_vocab, tags, vocab = \
        TaggingDataset.initialize_vocab_and_tags(tags, vocab)

    seqs = []
    for sentence in reader:
        x = []
        y = []
        for word in sentence:
            word_id, tag_id = TaggingDataset.get_word_and_tag_id(
                word, vocab, tags, learn_vocab, learn_tags)
            x.append(word_id)
            y.append(tag_id)

        seqs.append((x, y))

    return TaggingDataset(seqs, vocab, tags)
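# Usage sketch (illustrative): each element of seqs is an (x, y) pair of
# word-id and tag-id lists for one sentence. `data.seqs` and `data.tags`
# assume the constructor stores its arguments under the same names; rev() is
# the id-to-string mapping used by evaluate_tagger_and_writeout above, and
# "train.conllu" is a hypothetical path.
#
#   data = load_from_file("train.conllu")
#   x, y = data.seqs[0]
#   tag_strings = [data.tags.rev(tag_id) for tag_id in y]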
#!/usr/bin/env python

# This file is part of RH_NNTagging <http://github.com/ufal/rh_nntagging/>.
#
# Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import collections
import sys

import conllu

dictionary = {}

# Process all arguments as training files.
for arg in sys.argv[1:]:
    reader = conllu.reader(arg)
    sentence = []
    while reader.next_sentence(sentence):
        for word in sentence:
            dictionary.setdefault(word.form, collections.defaultdict(int))[
                "\t".join([word.lemma, word.upos, word.lpos, word.feats])] += 1

# For each form, keep the most frequent analysis, breaking ties by taking
# the lexicographically smaller one.
for form in dictionary:
    best, best_count = '', 0
    for analysis, count in dictionary[form].items():
        if count > best_count or (count == best_count and analysis < best):
            best, best_count = analysis, count
    dictionary[form] = best

# Analyse all data passed on standard input to standard output.
stdin = conllu.reader()
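# A hedged sketch of the analysis loop the comment above describes (a
# reconstruction, not the original code, which ends at the reader above):
# each known form receives its most frequent analysis, unknown forms are
# left untouched.
stdout = conllu.writer()
sentence = []
while stdin.next_sentence(sentence):
    for word in sentence:
        if word.form in dictionary:
            word.lemma, word.upos, word.lpos, word.feats = \
                dictionary[word.form].split("\t")
    stdout.write_sentence(sentence)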
#!/usr/bin/env python

# This file is part of RH_NNTagging <http://github.com/ufal/rh_nntagging/>.
#
# Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Remove all annotations from CoNLL-U file except for form.
# Reads from files given as arguments and writes to standard output.

import sys

import conllu

stdout = conllu.writer()

sentence = []
for arg in sys.argv[1:]:
    reader = conllu.reader(arg)
    while reader.next_sentence(sentence):
        for word in sentence:
            word.lemma, word.upos, word.lpos, word.feats, word.head, \
                word.deprel, word.deps, word.misc = \
                '', '', '', '', -1, '', '', ''
        stdout.write_sentence(sentence)
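# Usage sketch (illustrative; the script and file names are hypothetical):
#
#   ./remove_annotations.py gold.conllu > forms_only.conllu
#
# The stripped output mirrors what a tagger sees at prediction time, while
# the gold file is kept for scoring.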