예제 #1
0
 # Script body: parse the three required file options, train a CRF model on
 # the training instances, then decode every test instance and collect its
 # predicted and gold TAG assignments.
 opt = OptionParser()
 # insert options here
 opt.add_option('--test', dest='test_file', default='')
 opt.add_option('--train', dest='train_file', default='')
 opt.add_option('--feats', dest='feats_file', default='')
 (options, _) = opt.parse_args()
 # Every option defaults to '', so an empty value means it was not supplied.
 if not (options.feats_file and options.train_file and options.test_file):
     sys.stderr.write("Usage: jython tagger-of.py --feats [feats file] --train [train file] --test [test file]\n")
     # sys.exit instead of the bare exit(): the latter is a convenience name
     # injected by the site module and is not meant for use in programs.
     sys.exit(1)
 tag_list, obs_list, factor_cell_to_features, feature_label2id = load_factor_features(options.feats_file)
 sys.stderr.write("prepare training instances... \n")
 training_instances = make_instances(options.train_file, tag_list, obs_list)
 trainer = CrfTrainer(get_trainer_prm())
 # The model is sized by the number of entries in feature_label2id.
 factor_graph_model = FgModel(len(feature_label2id))
 sys.stderr.write("training... \n")
 trainer.train(factor_graph_model, training_instances)
 sys.stderr.write("testing... \n")
 testing_instances = make_instances(options.test_file, tag_list, obs_list)

 def _tag_states(config):
     """Return {variable name: state name} for the variables of *config*
     whose name starts with 'TAG'."""
     return dict((v.getName(), config.getStateName(v)) for v in config.getVars()
                 if v.getName().startswith('TAG'))

 correct_label_count = 0
 # (sic) spelling kept: code outside this excerpt may refer to this name.
 total_lable_count = 0
 for test_idx in range(testing_instances.size()):
     test_instance = testing_instances.get(test_idx)
     gold_config = test_instance.getGoldConfig()
     # A fresh decoder is built for every instance; it is not hoisted because
     # whether a decoder can be reused across instances is not visible here.
     decoder = MbrDecoder(MbrDecoderPrm())
     decoder.decode(factor_graph_model, test_instance)
     predicted_config = decoder.getMbrVarConfig()
     predictions = _tag_states(predicted_config)
     gold = _tag_states(gold_config)
     # Both configs must expose the same number of TAG variables.
     assert len(gold) == len(predictions)
예제 #2
0
        if i.strip() != ''
    ]
    de_vocab = [
        i.strip()
        for i in codecs.open(options.de_vocab, 'r', 'utf8').readlines()
        if i.strip() != ''
    ]
    for env in en_vocab:
        add_to_tags(env)
    uc_training = UnCachedFgList(training_instanes=training_ti,
                                 en_vocab=en_vocab)
    for idx, ti in enumerate(training_ti):
        print idx, uc_training.get(idx)
    trainer = CrfTrainer(get_trainer_prm())
    exit(1)
    feature_ids, feature_labels = zip(
        *sorted([(v, k) for k, v in feature_label2id.iteritems()]))
    # initialize weight for each feature
    factor_graph_model = FgModel(len(feature_label2id), list(feature_labels))
    for fid in list(feature_ids):
        factor_graph_model.add(fid, 0.0)

    trainer.train(factor_graph_model, uc_training)
    sw = FileWriter('feature.weights')
    factor_graph_model.printModel(sw)
    sw = codecs.open('feature.names', 'w', 'utf8')
    for k, i in feature_label2id.iteritems():
        sw.write(str(i) + '\t' + str(k) + '\n')
    sw.flush()
    sw.close()
예제 #3
0
        ti, obs, guess = get_instance(line)
        training_ti.append(ti)

    for line in open(options.test_file).readlines():
        ti, obs, guess = get_instance(line)
        testing_ti.append(ti)

    en_vocab = [i.strip() for i in codecs.open(options.en_vocab, 'r', 'utf8').readlines() if i.strip() != '']
    de_vocab = [i.strip() for i in codecs.open(options.de_vocab, 'r', 'utf8').readlines() if i.strip() != '']
    for env in en_vocab:
        add_to_tags(env)
    uc_training = UnCachedFgList(training_instanes=training_ti, en_vocab=en_vocab)
    for idx, ti in enumerate(training_ti):
        print idx, uc_training.get(idx)
    trainer = CrfTrainer(get_trainer_prm())
    exit(1)
    feature_ids, feature_labels = zip(*sorted([(v, k) for k, v in feature_label2id.iteritems()]))
    # initialize weight for each feature
    factor_graph_model = FgModel(len(feature_label2id), list(feature_labels))
    for fid in list(feature_ids):
        factor_graph_model.add(fid, 0.0)

    trainer.train(factor_graph_model, uc_training)
    sw = FileWriter('feature.weights')
    factor_graph_model.printModel(sw)
    sw = codecs.open('feature.names', 'w', 'utf8')
    for k, i in feature_label2id.iteritems():
        sw.write(str(i) + '\t' + str(k) + '\n')
    sw.flush()
    sw.close()