def main():
    """Tag words streamed on stdin and write "word/POS" sentences to stdout.

    Reads one space-separated sentence per input line, feeds the word IDs
    lazily into the tagger, and flushes each tagged sentence as soon as all
    of its tags have been produced.
    """
    if len(sys.argv) != 2:
        print('usage: python postagger-runner.py model-prefix < input > output', file=sys.stderr)
        return
    model_prefix = sys.argv[1]

    # load the word/POS dictionaries saved at training time
    word_ids = Dictionary.load(model_prefix + '.wordid')
    pos_ids = Dictionary.load(model_prefix + '.posid')

    # load the trained tagger
    tagger = POSTagger.load(model_prefix)

    # qs: POS names predicted so far for the sentence at the head of wss
    qs = []
    # wss: queue of sentences (word lists) not yet fully emitted
    wss = []

    # input iterator: yields word IDs lazily and, as a side effect, records
    # each complete sentence in wss so the consumer loop below can pair the
    # predicted tags back up with the original words
    def iterate_words():
        for l in sys.stdin:
            ls = l.strip('\n').split(' ')
            wss.append(ls)
            for w in ls:
                yield word_ids[w]

    for w, p in tagger.iterate(iterate_words()):
        qs.append(pos_ids.get_name(p))
        # once we have one tag per word of the oldest pending sentence,
        # print it, flush (stream-friendly), and move on to the next one
        if len(qs) >= len(wss[0]):
            print(' '.join('%s/%s' % wq for wq in zip(wss[0], qs)))
            sys.stdout.flush()
            qs = []
            wss.pop(0)
def main():
    """Evaluate a saved POS tagger on a labelled test corpus.

    Expects two command-line arguments (test-data prefix, model prefix);
    prints a usage message to stderr and returns when they are missing.
    """
    if len(sys.argv) != 3:
        for msg in ('usage: python postagger-test.py',
                    ' <str: test prefix>',
                    ' <str: model prefix>'):
            print(msg, file=sys.stderr)
        return

    test_prefix, model_prefix = sys.argv[1], sys.argv[2]

    print('loading data ...', file=sys.stderr)

    # load test data (words case-folded to match training-time preprocessing)
    test_words = [token.lower() for token in utils.read_data(test_prefix + '.words')]
    test_pos = utils.read_data(test_prefix + '.pos')

    # restore the dictionaries saved alongside the model
    word_ids = Dictionary.load(model_prefix + '.wordid')
    pos_ids = Dictionary.load(model_prefix + '.posid')

    # map surface forms and tags to their integer IDs
    test_wids = [word_ids[token] for token in test_words]
    test_pids = [pos_ids[tag] for tag in test_pos]

    # load the trained tagger and run the evaluation
    tagger = POSTagger.load(model_prefix)
    tagger.test(test_wids, test_pids)
def main():
    """Train a POS tagger from annotated corpora and save the model.

    Expects eight command-line arguments: train/dev/model prefixes followed
    by five integer hyper-parameters.  Prints usage to stderr and returns
    when the argument count is wrong.
    """
    if len(sys.argv) != 9:
        for msg in ('usage: python postagger-train.py',
                    ' <str: train prefix>',
                    ' <str: dev prefix>',
                    ' <str: model prefix>',
                    ' <int: word n-gram size>',
                    ' <int: POS n-gram size>',
                    ' <int: word window size>',
                    ' <int: POS history size>',
                    ' <int: max iteration>'):
            print(msg, file=sys.stderr)
        return

    train_prefix = sys.argv[1]
    dev_prefix = sys.argv[2]
    model_prefix = sys.argv[3]
    # the remaining five arguments are integer hyper-parameters
    (word_ngram_size, pos_ngram_size, word_window_size,
     pos_history_size, max_iteration) = (int(arg) for arg in sys.argv[4:9])

    print('loading data ...', file=sys.stderr)

    # load train/dev corpora (words case-folded; tags kept verbatim)
    train_words = [token.lower() for token in utils.read_data(train_prefix + '.words')]
    train_pos = utils.read_data(train_prefix + '.pos')
    dev_words = [token.lower() for token in utils.read_data(dev_prefix + '.words')]
    dev_pos = utils.read_data(dev_prefix + '.pos')

    # build dictionaries from the training data only and persist them
    # next to the model so the runner/test scripts can reload them
    word_ids = Dictionary(train_words, frozen=True)
    pos_ids = Dictionary(train_pos, frozen=True)
    word_ids.save(model_prefix + '.wordid')
    pos_ids.save(model_prefix + '.posid')

    # map surface forms and tags to integer IDs
    train_wids = [word_ids[token] for token in train_words]
    train_pids = [pos_ids[tag] for tag in train_pos]
    dev_wids = [word_ids[token] for token in dev_words]
    dev_pids = [pos_ids[tag] for tag in dev_pos]

    # construct and train the tagger; checkpoints go under model_prefix
    tagger = POSTagger(word_ngram_size, pos_ngram_size, word_window_size, pos_history_size)
    tagger.train(len(pos_ids), train_wids, train_pids, dev_wids, dev_pids, max_iteration, model_prefix)