示例#1
0
def main():
    """Tag space-tokenized sentences read from stdin and print ``word/POS`` lines.

    Expects exactly one command-line argument, the model prefix. The word and
    POS dictionaries are loaded from ``<prefix>.wordid`` and ``<prefix>.posid``
    and the tagger from ``<prefix>``. Every input line is split on single
    spaces; once as many tags as the oldest pending line has tokens have been
    collected, that line is printed as space-joined ``word/POS`` pairs and
    stdout is flushed.

    With any other number of arguments a usage message is written to stderr
    and the function returns without loading anything.
    """
    # Local import: the file-level import block is not visible from here.
    from collections import deque

    if len(sys.argv) != 2:
        print('usage: python postagger-runner.py model-prefix < input > output', file=sys.stderr)
        return

    model_prefix = sys.argv[1]

    # load dictionary
    word_ids = Dictionary.load(model_prefix + '.wordid')
    pos_ids = Dictionary.load(model_prefix + '.posid')

    # load tagger
    tagger = POSTagger.load(model_prefix)

    # Lines already read from stdin whose tags have not been printed yet.
    # NOTE(review): a queue is kept presumably because tagger.iterate may pull
    # words ahead of the tags it yields -- confirm against POSTagger.iterate.
    pending = deque()
    # Tag names collected so far for the oldest pending line.
    tags = []

    # input iterator
    def iterate_words():
        for line in sys.stdin:
            words = line.strip('\n').split(' ')
            pending.append(words)
            for word in words:
                # The training and test scripts lowercase words before they
                # build or query the word dictionary, so the lookup must be
                # lowercased as well; the original casing is kept for output.
                yield word_ids[word.lower()]

    for _, pos_id in tagger.iterate(iterate_words()):
        tags.append(pos_ids.get_name(pos_id))
        if len(tags) >= len(pending[0]):
            print(' '.join('%s/%s' % pair for pair in zip(pending[0], tags)))
            # Flush per sentence so a consumer on a pipe sees output promptly.
            sys.stdout.flush()
            tags = []
            pending.popleft()
示例#2
0
def main():
    """Run a saved tagger's ``test`` method on a test corpus.

    Command-line arguments (exactly two, otherwise a usage message is written
    to stderr and the function returns):

    1. test prefix  -- ``<prefix>.words`` and ``<prefix>.pos`` are read
       through ``utils.read_data``.
    2. model prefix -- ``<prefix>.wordid`` / ``<prefix>.posid`` hold the
       dictionaries and ``<prefix>`` is handed to ``POSTagger.load``.

    Words are lowercased before they are mapped to IDs; POS entries are
    looked up unchanged.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        usage_lines = (
            'usage: python postagger-test.py',
            '                <str: test prefix>',
            '                <str: model prefix>',
        )
        for usage_line in usage_lines:
            print(usage_line, file=sys.stderr)
        return

    test_prefix, model_prefix = cli_args

    print('loading data ...', file=sys.stderr)

    # test corpus: lowercased words and their POS entries
    raw_words = utils.read_data(test_prefix + '.words')
    lowered_words = [word.lower() for word in raw_words]
    gold_tags = utils.read_data(test_prefix + '.pos')

    # dictionaries stored next to the model
    vocab = Dictionary.load(model_prefix + '.wordid')
    tagset = Dictionary.load(model_prefix + '.posid')

    # map the corpus to ID sequences
    word_id_seq = [vocab[word] for word in lowered_words]
    tag_id_seq = [tagset[tag] for tag in gold_tags]

    # load the tagger and run its test on the ID sequences
    POSTagger.load(model_prefix).test(word_id_seq, tag_id_seq)
示例#3
0
def main():
    """Build dictionaries from the training data and train a POS tagger.

    Command-line arguments (exactly eight, otherwise a usage message is
    written to stderr and the function returns):

    1. train prefix -- ``<prefix>.words`` / ``<prefix>.pos`` are read
    2. dev prefix   -- ``<prefix>.words`` / ``<prefix>.pos`` are read
    3. model prefix -- dictionaries are saved to ``<prefix>.wordid`` and
       ``<prefix>.posid``; the prefix is also passed to ``tagger.train``
    4. word n-gram size (int)
    5. POS n-gram size (int)
    6. word window size (int)
    7. POS history size (int)
    8. max iteration (int)

    Words of both corpora are lowercased before use. The dictionaries are
    built from the training data only, with ``frozen=True``.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 8:
        for usage_line in (
            'usage: python postagger-train.py',
            '                <str: train prefix>',
            '                <str: dev prefix>',
            '                <str: model prefix>',
            '                <int: word n-gram size>',
            '                <int: POS n-gram size>',
            '                <int: word window size>',
            '                <int: POS history size>',
            '                <int: max iteration>',
        ):
            print(usage_line, file=sys.stderr)
        return

    train_prefix, dev_prefix, model_prefix = cli_args[:3]
    (word_ngram_size, pos_ngram_size, word_window_size,
     pos_history_size, max_iteration) = map(int, cli_args[3:])

    print('loading data ...', file=sys.stderr)

    def read_lowered(path):
        # Read the entries of `path` and lowercase each of them.
        return [entry.lower() for entry in utils.read_data(path)]

    def to_ids(dictionary, entries):
        # Map every entry to its ID in `dictionary`.
        return [dictionary[entry] for entry in entries]

    # load train/dev data
    train_words = read_lowered(train_prefix + '.words')
    train_pos = utils.read_data(train_prefix + '.pos')
    dev_words = read_lowered(dev_prefix + '.words')
    dev_pos = utils.read_data(dev_prefix + '.pos')

    # build both dictionaries from the training data, then store them
    word_ids = Dictionary(train_words, frozen=True)
    pos_ids = Dictionary(train_pos, frozen=True)
    word_ids.save(model_prefix + '.wordid')
    pos_ids.save(model_prefix + '.posid')

    # convert the corpora to ID sequences
    train_wids = to_ids(word_ids, train_words)
    train_pids = to_ids(pos_ids, train_pos)
    dev_wids = to_ids(word_ids, dev_words)
    dev_pids = to_ids(pos_ids, dev_pos)

    # train
    tagger = POSTagger(word_ngram_size, pos_ngram_size, word_window_size, pos_history_size)
    tagger.train(len(pos_ids), train_wids, train_pids, dev_wids, dev_pids, max_iteration, model_prefix)