def tag_test(test_feature_file, trained_model_file, tag_test_set_file): fin = codecs.open(test_feature_file, 'r', 'utf-8') fout = codecs.open(tag_test_set_file, 'w', 'utf-8') m = MaxentModel() m.load(trained_model_file) contents = fin.read() feature_list = contents.split('\r') feature_list.remove('\n') for feature in feature_list: if (feature == 'split'): fout.write('\n\n\n') continue str_feature = [] u_feature = feature.split(' ') for item in u_feature: str_feature.append(item.encode('utf-8')) label_prob_list = m.eval_all(str_feature) label = max_prob(label_prob_list) try: new_tag = str_feature[2].split('=')[1] + '/' + label except IndexError: print str_feature fout.write(new_tag.decode('utf-8')) pre_tag = label return feature_list
def __init__(self, restrictFeatures=False): Classifier.__init__(self) print "MaximumEntropy: Creating model" self.model = MaxentModel() self.model.verbose = 1 self.restrictFeatures = restrictFeatures self.model.begin_add_event()
def simple_train(event_list):
    """Train a MaxentModel from a list of (context, outcome) events.

    Each element of `event_list` is indexed for its context (index 0)
    and outcome (index 1).  Runs 30 L-BFGS iterations with a Gaussian
    prior of 2 and returns the trained model.
    """
    model = MaxentModel()
    model.begin_add_event()
    for event in event_list:
        model.add_event(event[0], event[1])
    model.end_add_event()
    # maxent.set_verbose(1)
    model.train(30, 'lbfgs', 2)
    return model
def train(corpus, *args):
    """Train a MaxentModel over all datums in `corpus`.

    For every datum, its feature projection becomes one training event
    labelled with `datum.is_related` and weighted by the datum's trust
    (scaled to an integer).  `*args` is forwarded to `model.train`.

    Returns (trained model, {row_in_corpus: projection}).
    """
    projections = {}
    model = MaxentModel()
    model.begin_add_event()
    for datums in corpus.values():
        for datum in datums:
            projection = datum2features(datum)
            # FIX: reuse the projection computed above — the original
            # called datum2features(datum) a second time here, doing the
            # feature extraction twice per datum for no benefit.
            model.add_event(projection, datum.is_related,
                            long(100 * float(datum._trust)))
            projections[datum.row_in_corpus] = projection
    model.end_add_event()
    model.train(*args)
    return model, projections
def train_model(options, iterable):
    """Train a maxent sequence labeller with two passes over `iterable`.

    Pass 1 collects word frequencies and per-word label counts for words
    bearing B-/I- chunk labels.  Pass 2 extracts features per token
    (conditioned on the previous gold label) and feeds events into the
    model.  The model is saved to `options.model + ".maxent"` and the
    collected statistics are pickled to `options.model + ".data"`.
    """
    model = MaxentModel()
    data = {}
    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "  ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "  {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                # BUG FIX: the original incremented the per-label count
                # only when `word` was already present, so the *first*
                # labelled occurrence of every word was silently dropped.
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "  {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            # "^" is the pseudo-label for the sentence start.
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
def main():
    """CLI entry point: load a POS tagger model and tag input text.

    Reads from the file given as the first positional argument (default:
    stdin) and writes tagged output to -o/--output (default: stdout).
    Exits with status 1 when -m/--model is missing.
    """
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o", "--output", type="string",
                      help="write tagged result to OUTPUT")
    parser.add_option("-m", "--model", type="string",
                      help="load trained model from MODEL")
    parser.add_option("-t", "--test", action="store_true", default=0,
                      help="test mode, include original tag in output")
    parser.add_option("-v", "--verbose", action="store_true",
                      dest="verbose", default=1)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose")
    parser.add_option("-T", "--type", type="int", default=None,
                      help="choose context type")
    (options, args) = parser.parse_args()

    if not options.model:
        print >> sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)

    model = options.model
    # The tag dictionary is pickled next to the model file.
    tag_dict = cPickle.load(open(model + '.tagdict'))
    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)

    tag_in_file = sys.stdin
    if len(args) >= 1:
        tag_in_file = open(args[0])
    tag_out_file = sys.stdout
    if options.output:
        # BUG FIX: the original called open(out, 'w') with an undefined
        # name `out`, raising NameError whenever -o/--output was given.
        tag_out_file = open(options.output, 'w')
    tag_file(tagger, tag_in_file, tag_out_file, options.test)
def main(): if len(sys.argv) != 2: print "Usage: MaxentTest.py modelName" sys.exit(1) model = MaxentModel() model.load(sys.argv[1]) context = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] label = model.eval(context, str(0)) #result = model.eval_all(context) print "Result: ", label
def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org, sentences,
                 f, op):
    """Predict the best (name, org, relation) triple for each sentence.

    For every sentence, scores each candidate name/org pair from the
    3-best names and 5-best orgs against a fixed relation inventory
    using a pre-trained maxent model, keeps the highest-probability
    non-'others' triple, reconciles it with the 1-best candidates via
    resolve_conflicts, and writes results to `op` (pairs) and `f`
    (winning triples).
    """
    rel = [
        'others', 'director', 'analyst', 'advisor', 'head', 'manager',
        'spokesperson', 'founder', 'professor', 'leave', 'lawyer'
    ]
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org, best_3_name,
                                    best_5_org, sentences):
        # Idiom: truthiness test instead of len(...) == 0.
        if not n3 or not o5:
            # No candidates to score — emit the 1-best pair unchanged.
            op.write(str((n1, o1)) + '\n')
        else:
            # Best triple seen so far: (name, org, relation, probability).
            j = ('', '', '', 0.0)
            d = {}
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)
                    relation = ''
                    prob = 0.0
                    # Idiom: `is not None` instead of `!= None`.
                    if context is not None:
                        # Pick the relation the model scores highest.
                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        d[(name, org)] = relation
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)
                    else:
                        d[(name, org)] = 'others'
            resolve_conflicts(n1, o1, j)
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
            count += 1
def eval_model(options, iterable): model = MaxentModel() data = {} print >> sys.stderr, "*** Loading..." model.load(options.model + ".maxent") with open(options.model + ".data", "r") as handle: data = cPickle.load(handle) print >> sys.stderr, "*** Evaluating..." for n, sentence in enumerate(iterable): if (n % 100) == 0: print >> sys.stderr, " {0:6d} sentences...".format(n) words, poses = map(list, zip(*sentence)) labels = eval_model_sentence(options, data, model, words, poses) for word, pos, label in zip(words, poses, labels): print label print
def training(feature_file_path, trained_model_file, times): m = MaxentModel() fin = codecs.open(feature_file_path, 'r', 'utf-8') all_list = [] m.begin_add_event() for line in fin: line = line.rstrip() line_list = line.split(' ') str_list = [] for item in line_list: str_list.append(item.encode('utf-8')) all_list.append(str_list) m.add_event(str_list[1:], str_list[0], 1) m.end_add_event() print 'begin training' m.train(times, "lbfgs") print 'end training' m.save(trained_model_file) return all_list
def main(): if len(sys.argv) != 4: print "Usage: MaxentTrain.py features.mat labels.mat modelName" sys.exit(1) features = featureMatrice(sys.argv[1]) labels = labelLst(sys.argv[2]) model = MaxentModel() # add data into model model.begin_add_event() for i in range(len(labels)): model.add_event(features[i], str(labels[i]), 1) model.end_add_event() # start training #model.train() model.train(1000, "gis", 2) #model.train(30, "lbfgs") # save the model model.save(sys.argv[3])
def main():
    """Train a maximum-entropy POS-tagger model (three passes over the data).

    Pass 1 gathers word frequencies, pass 2 collects features and the tag
    dictionary (with cutoffs applied), pass 3 trains the maxent model with
    L-BFGS and saves it under the model name given as the sole positional
    argument.  Mutates the module globals feat_dict, me, rare_freq and
    get_context.  Exits early via sys.exit for --extract/--events_out modes
    and on missing arguments.
    """
    global feat_dict, me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                      metavar="FILE",
                      help="train a ME model with data from FILE")
    parser.add_option("--heldout", type="string", metavar="FILE",
                      help="use heldout events from FILE")
    parser.add_option("--extract", type="string", metavar="FILE",
                      help="extract training data to file")
    parser.add_option("--events_out", type="string",
                      help="write training(heldout) events to file")
    parser.add_option(
        "-c", "--cutoff", type="int", default=10,
        help="discard feature with frequency < CUTOFF when training\
 [default=10]")
    parser.add_option(
        "-r", "--rare", type="int", default=5,
        help="use special feature for rare word with frequency < RARE \
[default=5]")
    parser.add_option("-g", "--gaussian", type="float", default=0.0,
                      help="apply Gaussian penality when training \
[default=0.0]")
    parser.add_option(
        "-b", "--binary", action="store_true", default=0,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff", type="int", default=1,
        help="discard event with frequency < CUTOFF when training \
[default=1]")
    parser.add_option(
        "--iters", type="int", default=15,
        help="how many iterations are required for training[default=15]")
    parser.add_option("-T", "--type", type="int", default=None,
                      help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    # NOTE(review): `file` shadows the builtin; left unchanged here.
    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    # These globals are read by the feature-extraction helpers.
    global rare_freq
    rare_freq = options.rare
    global get_context
    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequence information to %s' % col(
        word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
    # tag_dict_file = options.filename + '.tagdict'
    # print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
    # save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'),
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file, 'w'))
    print 'done'
    #}}}

    # --extract mode: dump training data and stop before training.
    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass:training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        # NOTE(review): string exceptions were removed in Python 2.6+;
        # this `raise` would itself raise TypeError.  The code below it
        # is unreachable — presumably an intentional guard on an
        # untested path; confirm before enabling.
        raise 'not tested'
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        # NOTE(review): same unreachable-guard pattern as above.
        raise 'not tested'
        print 'dumping training events to', col(options.events_out, 'lgreen')
        # import hotshot, hotshot.stats
        # prof = hotshot.Profile("dump_events.prof", 1)
        # prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)
    me.train(options.iters, 'lbfgs', options.gaussian)
    print 'training finished'
    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
from maxent import MaxentModel

# Train one maxent model per context file: each line of
# contexts/contextsN.txt looks like "relation: <python-list literal>".
for i in range(5):
    m = MaxentModel()
    m.begin_add_event()
    with open('contexts/contexts' + str(i + 1) + '.txt', 'r') as f:
        for line in f:
            line = line.rstrip()
            try:
                ind = line.index(':')
                if line[:ind] != '':
                    rel = line[:ind]
                    # SECURITY NOTE: eval() on file contents executes
                    # arbitrary code — only safe for trusted files;
                    # consider ast.literal_eval instead.
                    l = eval(line[ind + 1:])
                    m.add_event(l, rel, 1)
            except Exception:
                # FIX: narrowed from a bare `except:` — still best-effort
                # skipping of malformed lines, but no longer swallows
                # KeyboardInterrupt/SystemExit.
                pass
    m.end_add_event()
    m.train(100, 'lbfgs')
    s_name = "models/lbfgs/model" + str(i + 1)
    m.save(s_name)
#!/usr/bin/env python2 # -*- coding: utf-8 -*- # Imports import sys, os # Load MaxEnt models corpusPath = os.environ.get('CORPUS_PATH') from maxent import MaxentModel maxEntModel = MaxentModel() maxEntModel.load(corpusPath + '/model_markers.txt') for trainLine in sys.stdin.readlines(): trainCols = trainLine.split('\t') modelMarkerProbas = maxEntModel.eval_all(trainCols[1:]) probaFeats = [] for modelMarkerProba in modelMarkerProbas: if modelMarkerProba[1] > 0.00001: probaFeats.append(modelMarkerProba[0] + ':' + str(modelMarkerProba[1])) print trainCols[0] + '\t' + '\t'.join(probaFeats)