Пример #1
0

if __name__ == '__main__':
    # Command-line entry point: read the feature definitions, train a model on
    # the --train file, then decode every instance built from the --test file.
    opt = OptionParser()
    opt.add_option('--test', dest='test_file', default='')
    opt.add_option('--train', dest='train_file', default='')
    opt.add_option('--feats', dest='feats_file', default='')
    (options, _) = opt.parse_args()
    # Every option defaults to '', so an empty value means the path was not
    # supplied (or was supplied empty); all three are required.
    if not (options.feats_file and options.train_file and options.test_file):
        sys.stderr.write("Usage: jython tagger-of.py --feats [feats file] --train [train file] --test [test file]\n")
        # sys.exit instead of the bare exit(): the latter is only injected by
        # the site module as an interactive-shell convenience.
        sys.exit(1)
    tag_list, obs_list, factor_cell_to_features, feature_label2id = load_factor_features(options.feats_file)
    sys.stderr.write("prepare training instances... \n")
    training_instances = make_instances(options.train_file, tag_list, obs_list)
    trainer = CrfTrainer(get_trainer_prm())
    # The model is sized by the number of known feature labels.
    factor_graph_model = FgModel(len(feature_label2id))
    sys.stderr.write("training... \n")
    trainer.train(factor_graph_model, training_instances)
    sys.stderr.write("testing... \n")
    testing_instances = make_instances(options.test_file, tag_list, obs_list)
    correct_label_count = 0
    # NOTE(review): "lable" is a typo for "label"; the name is left untouched
    # because code following this excerpt may still refer to it.
    total_lable_count = 0
    for test_idx in range(testing_instances.size()):
        test_instance = testing_instances.get(test_idx)
        gold_config = test_instance.getGoldConfig()
        # A fresh decoder is created for each test instance.
        decoder = MbrDecoder(MbrDecoderPrm())
        decoder.decode(factor_graph_model, test_instance)
        predicted_config = decoder.getMbrVarConfig()
        # Variable name -> predicted state name, restricted to the variables
        # whose name starts with 'TAG'.
        predictions = dict((v.getName(), predicted_config.getStateName(v)) for v in predicted_config.getVars() if
                           v.getName().startswith('TAG'))
Пример #2
0
        i.strip()
        for i in codecs.open(options.en_vocab, 'r', 'utf8').readlines()
        if i.strip() != ''
    ]
    de_vocab = [
        i.strip()
        for i in codecs.open(options.de_vocab, 'r', 'utf8').readlines()
        if i.strip() != ''
    ]
    for env in en_vocab:
        add_to_tags(env)
    uc_training = UnCachedFgList(training_instanes=training_ti,
                                 en_vocab=en_vocab)
    for idx, ti in enumerate(training_ti):
        print idx, uc_training.get(idx)
    trainer = CrfTrainer(get_trainer_prm())
    exit(1)
    feature_ids, feature_labels = zip(
        *sorted([(v, k) for k, v in feature_label2id.iteritems()]))
    # initialize weight for each feature
    factor_graph_model = FgModel(len(feature_label2id), list(feature_labels))
    for fid in list(feature_ids):
        factor_graph_model.add(fid, 0.0)

    trainer.train(factor_graph_model, uc_training)
    sw = FileWriter('feature.weights')
    factor_graph_model.printModel(sw)
    sw = codecs.open('feature.names', 'w', 'utf8')
    for k, i in feature_label2id.iteritems():
        sw.write(str(i) + '\t' + str(k) + '\n')
    sw.flush()
Пример #3
0
    for line in codecs.open(options.train_file, 'r', 'utf8').readlines():
        ti, obs, guess = get_instance(line)
        training_ti.append(ti)

    for line in open(options.test_file).readlines():
        ti, obs, guess = get_instance(line)
        testing_ti.append(ti)

    en_vocab = [i.strip() for i in codecs.open(options.en_vocab, 'r', 'utf8').readlines() if i.strip() != '']
    de_vocab = [i.strip() for i in codecs.open(options.de_vocab, 'r', 'utf8').readlines() if i.strip() != '']
    for env in en_vocab:
        add_to_tags(env)
    uc_training = UnCachedFgList(training_instanes=training_ti, en_vocab=en_vocab)
    for idx, ti in enumerate(training_ti):
        print idx, uc_training.get(idx)
    trainer = CrfTrainer(get_trainer_prm())
    exit(1)
    feature_ids, feature_labels = zip(*sorted([(v, k) for k, v in feature_label2id.iteritems()]))
    # initialize weight for each feature
    factor_graph_model = FgModel(len(feature_label2id), list(feature_labels))
    for fid in list(feature_ids):
        factor_graph_model.add(fid, 0.0)

    trainer.train(factor_graph_model, uc_training)
    sw = FileWriter('feature.weights')
    factor_graph_model.printModel(sw)
    sw = codecs.open('feature.names', 'w', 'utf8')
    for k, i in feature_label2id.iteritems():
        sw.write(str(i) + '\t' + str(k) + '\n')
    sw.flush()
    sw.close()