# Command-line interface: --feats, --train and --test must all be supplied.
opt = OptionParser()
# insert options here
opt.add_option('--test', dest='test_file', default='')
opt.add_option('--train', dest='train_file', default='')
opt.add_option('--feats', dest='feats_file', default='')
options, _ = opt.parse_args()

# Every option defaults to '', so an empty value means it was not given.
if '' in (options.feats_file, options.train_file, options.test_file):
    sys.stderr.write("Usage: jython tagger-of.py --feats [feats file] --train [train file] --test [test file]\n")
    exit(1)

tag_list, obs_list, factor_cell_to_features, feature_label2id = load_factor_features(options.feats_file)

sys.stderr.write("prepare training instances... \n")
training_instances = make_instances(options.train_file, tag_list, obs_list)
trainer = CrfTrainer(get_trainer_prm())
# The model is sized by the number of known feature labels.
factor_graph_model = FgModel(len(feature_label2id))

sys.stderr.write("training... \n")
trainer.train(factor_graph_model, training_instances)

sys.stderr.write("testing... \n")
testing_instances = make_instances(options.test_file, tag_list, obs_list)


def _tag_states(config):
    """Map the name of every variable starting with 'TAG' to its state name in *config*."""
    states = {}
    for var in config.getVars():
        var_name = var.getName()
        if var_name.startswith('TAG'):
            states[var_name] = config.getStateName(var)
    return states


# NOTE(review): both counters are initialised here but never updated in the
# visible part of the loop; the scoring code presumably follows this chunk.
correct_label_count = 0
total_lable_count = 0
for test_idx in range(testing_instances.size()):
    test_instance = testing_instances.get(test_idx)
    gold_config = test_instance.getGoldConfig()
    # A fresh decoder is built for every test instance.
    decoder = MbrDecoder(MbrDecoderPrm())
    decoder.decode(factor_graph_model, test_instance)
    predicted_config = decoder.getMbrVarConfig()
    predictions = _tag_states(predicted_config)
    gold = _tag_states(gold_config)
    # Gold and predicted configurations must hold the same number of TAG variables.
    assert len(gold) == len(predictions)
if i.strip() != '' ] de_vocab = [ i.strip() for i in codecs.open(options.de_vocab, 'r', 'utf8').readlines() if i.strip() != '' ] for env in en_vocab: add_to_tags(env) uc_training = UnCachedFgList(training_instanes=training_ti, en_vocab=en_vocab) for idx, ti in enumerate(training_ti): print idx, uc_training.get(idx) trainer = CrfTrainer(get_trainer_prm()) exit(1) feature_ids, feature_labels = zip( *sorted([(v, k) for k, v in feature_label2id.iteritems()])) # initialize weight for each feature factor_graph_model = FgModel(len(feature_label2id), list(feature_labels)) for fid in list(feature_ids): factor_graph_model.add(fid, 0.0) trainer.train(factor_graph_model, uc_training) sw = FileWriter('feature.weights') factor_graph_model.printModel(sw) sw = codecs.open('feature.names', 'w', 'utf8') for k, i in feature_label2id.iteritems(): sw.write(str(i) + '\t' + str(k) + '\n') sw.flush() sw.close()
ti, obs, guess = get_instance(line)
training_ti.append(ti)
# NOTE(review): the two statements above are the tail of a loop whose `for`
# header lies outside this chunk; they must stay indented under that header.


def _read_vocab(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at *path*.

    The file handle is closed before returning, even if reading fails.
    """
    vocab_file = codecs.open(path, 'r', 'utf8')
    try:
        return [entry.strip() for entry in vocab_file.readlines()
                if entry.strip() != '']
    finally:
        vocab_file.close()


# Read the test file up front and close the handle before parsing its lines.
test_file = open(options.test_file)
try:
    test_lines = test_file.readlines()
finally:
    test_file.close()
for line in test_lines:
    ti, obs, guess = get_instance(line)
    testing_ti.append(ti)

en_vocab = _read_vocab(options.en_vocab)
de_vocab = _read_vocab(options.de_vocab)

# Register every en_vocab entry through add_to_tags.
for env in en_vocab:
    add_to_tags(env)

uc_training = UnCachedFgList(training_instanes=training_ti, en_vocab=en_vocab)

# Dump "<index> <example>" for every training instance; this is the same text
# the former `print idx, example` statement produced.
for idx, _ in enumerate(training_ti):
    print('%s %s' % (idx, uc_training.get(idx)))

trainer = CrfTrainer(get_trainer_prm())
# NOTE(review): this unconditional exit ends the script with status 1 before
# any training happens, so everything below is currently unreachable. It looks
# like a debugging leftover -- confirm before removing it.
exit(1)

# Sort the (id, label) pairs by id so both sequences line up positionally.
feature_ids, feature_labels = zip(
    *sorted((fid, label) for label, fid in feature_label2id.iteritems()))

# initialize weight for each feature
factor_graph_model = FgModel(len(feature_label2id), list(feature_labels))
for fid in feature_ids:
    factor_graph_model.add(fid, 0.0)
trainer.train(factor_graph_model, uc_training)

# Write the trained model. FileWriter is imported outside this chunk
# (presumably java.io.FileWriter -- confirm); closing it makes sure buffered
# output reaches the file, which the unclosed writer did not guarantee.
sw = FileWriter('feature.weights')
try:
    factor_graph_model.printModel(sw)
finally:
    sw.close()

# Write "<id>\t<label>" per feature. Formatting into a unicode string avoids
# the UnicodeEncodeError that str(label) raises for non-ASCII unicode labels;
# the UTF-8 writer does the encoding. close() also flushes the stream.
sw = codecs.open('feature.names', 'w', 'utf8')
try:
    for k, i in feature_label2id.iteritems():
        sw.write(u'%s\t%s\n' % (i, k))
finally:
    sw.close()