Example #1
from maxent import MaxentModel


class MaximumEntropyClassifier(Classifier):
    def __init__(self, restrictFeatures=False):
        Classifier.__init__(self)
        print "MaximumEntropy: Creating model"
        self.model = MaxentModel()
        self.model.verbose = 1
        self.restrictFeatures = restrictFeatures
        self.model.begin_add_event()

    def addToIndex(self, trainingset):
        for (vec,cls) in trainingset:
            self.addFeatureVector(vec,cls)
        
    def addFeatureVector(self, vec, cls, value=1, binary=False):
        # keep only whitelisted features; skip the filter when restrictFeatures is False
        if self.restrictFeatures:
            for key in vec.keys():
                if key not in self.restrictFeatures:
                    del vec[key]
        context = vec.keys()
        label = "%s" % cls
        self.model.add_event(context,label,value)

    def compile(self):
        self.model.end_add_event()
        self.model.train(30, "lbfgs", 2, 1E-03)
        #self.model.train(100, 'gis', 2)
        print "> Models trained"

    def classify(self, point, label='1', binary=False):
        result = self.model.eval(point.keys(), label)
        if result >= 0.5:
            return 1
        return -1
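
A minimal usage sketch for the classifier above, assuming the maxent toolkit and a compatible Classifier base class are importable; the feature dicts and labels are invented:

trainingset = [({"good": 1, "plot": 1}, 1),
               ({"bad": 1, "plot": 1}, -1)]
clf = MaximumEntropyClassifier(restrictFeatures=["good", "bad", "plot"])
clf.addToIndex(trainingset)
clf.compile()
print clf.classify({"good": 1, "pacing": 1})   # expected: 1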
Example #3
import maxent
from maxent import MaxentModel


def baseline(sentences, labels):

    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()

    with open(sentences) as file_content:
        sentences = file_content.readlines()
    with open(labels) as file_content:
        labels = file_content.readlines()

    for i in xrange(0, 3000):
        m.add_event(sentences[i].split(" "), labels[i].strip())

    m.end_add_event()

    m.train()

    correct = 0
    false = 0

    for i in xrange(3000, len(sentences)):
        result = m.eval(sentences[i].split(" "), "1")
        result = int(round(result))
        label = int(labels[i])
        if result == label:
            correct = correct + 1
        else:
            false = false + 1

    print "correct   :", correct
    print "false     :", false

    print("accuracy  : {:.2f}%".format(correct * 100.0 / (correct + false)))
Example #4
def main():
    global feat_dict, m

    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                      metavar="FILE",
                      help="train a Maxent model with data from FILE")
    parser.add_option("-g", "--gaussian", type="float", default=0.0,
                      help="apply Gaussian penalty when training [default=0.0]")
    parser.add_option("--iters", type="int", default=15,
                      help="how many iterations are required for training [default=15]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)

    model_name = args[0]

    global get_context
    get_context = Generator.get_context_wordform  # change this to use different features

    print 'First pass: gather features'
    extract_feature(file, gather_feature)
    feature_file = model_name + '.features'
    print 'save features to file %s' % feature_file
    save_features(feature_file)

    print 'feat_dict: ', feat_dict

    file.seek(0)
    print 'Second pass: training model...'
    m = MaxentModel()
    m.begin_add_event()
    extract_feature(file, add_event)
    m.end_add_event()

    m.train(options.iters, 'lbfgs', options.gaussian)
    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    m.save(model_name)
    print 'done'
Example #5
def simple_train(event_list):
    m = MaxentModel()
    m.begin_add_event()
    for e in event_list:
        m.add_event(e[0], e[1])
    m.end_add_event()
    #maxent.set_verbose(1)
    m.train(30, 'lbfgs', 2)
    return m
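
A self-contained illustration of the (context, label) event tuples simple_train() consumes; the feature names are invented and the maxent import used in the other examples is assumed:

events = [
    (['hot', 'dry'], 'sun'),
    (['cold', 'wet'], 'rain'),
    (['hot', 'humid'], 'sun'),
]
m = simple_train(events)
print m.eval_all(['hot'])   # list of (label, probability) pairs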
Example #6
def train_ne_binary_model(options, iterable):
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, "   ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_ne_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            if labels[i].startswith("B-") or labels[i].startswith("I-"):
                model.add_event(features, "NE")
            else:
                model.add_event(features, "O")

            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".ne.binary.maxent")
    with open(options.model + ".ne.binary.data", "w") as handle:
        cPickle.dump(data, handle)
Example #7
def train(corpus, *args):
    projections = {}
    model = MaxentModel()
    model.begin_add_event()
    for datums in corpus.values():
        for datum in datums:
            projection = datum2features(datum)
            model.add_event(projection, datum.is_related, long(100 * float(datum._trust)))
            projections[datum.row_in_corpus] = projection
    model.end_add_event()
    model.train(*args)
    return model, projections
Example #9
def train_model(options, iterable):
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "   ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                # create the per-word counter on first sight, then count the label
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
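
A hedged sketch of the inputs train_model() appears to expect, inferred from the attribute and tuple accesses above; it assumes compute_features and the maxent/cPickle imports from the surrounding module, and all values are illustrative:

from collections import namedtuple

Options = namedtuple("Options", "cutoff iterations technique gaussian model")
opts = Options(cutoff=1, iterations=100, technique="lbfgs", gaussian=0.0,
               model="conll-ner")

# each sentence is a list of (word, POS tag, BIO label) triples
corpus = [
    [("John", "NNP", "B-PER"), ("lives", "VBZ", "O"), ("in", "IN", "O"),
     ("Paris", "NNP", "B-LOC"), (".", ".", "O")],
]

train_model(opts, corpus)   # writes conll-ner.maxent and conll-ner.data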
Example #10
import maxent
from maxent import MaxentModel


def test():
    maxent.set_verbose(1)

    m = MaxentModel()

    m.begin_add_event()
    m.add_event(['1'], '1')
    m.add_event(['2'], '2')
    m.add_event(['3'], '3')
    m.end_add_event()

    m.train(30, 'lbfgs', 2, 1e-03)

    for x in map(str, range(1,4)):
        print "tested on:", x, "predicted:", m.eval_all([x])
Example #11
    def trainOn(self, train_groups):
        ''' Train on the train set and return the trained model '''
        maxent.set_verbose(1)

        m = MaxentModel()

        m.begin_add_event()

        for pair in train_groups:
            m.add_event(pair[0], pair[1])

        m.end_add_event()

        m.train(20, 'lbfgs', 1e-04, 1e-03)

        return m
Example #12
import codecs
from maxent import MaxentModel


def training(feature_file_path, trained_model_file, times):
  m = MaxentModel()
  fr = codecs.open(feature_file_path, 'r', 'utf-8')
  all_list = []
  m.begin_add_event()
  for line in fr:
    line = line.rstrip()
    line_list = line.split(' ')
    str_list = []
    for item in line_list:
      str_list.append(item.encode('utf-8'))
    all_list.append(str_list)
    m.add_event(str_list[1:], str_list[0], 1)
  m.end_add_event()
  print 'begin training'
  m.train(times, "lbfgs")
  print 'end training'
  m.save(trained_model_file)
  return all_list
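
A hedged example of the feature file training() reads, one event per line with the label first and its space-separated features after it; the file names and features are invented:

import codecs

with codecs.open('train.feats', 'w', 'utf-8') as f:
    f.write(u'POS good plot great acting\n')
    f.write(u'NEG dull plot weak acting\n')

all_events = training('train.feats', 'review.model', 30)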
Example #14
    def trainOn(self, train_groups, n_itr = 15, var = 1, tol = 1e-5):
        ''' Train on the train set and return the trained model '''

        print "training set:", Counter(zip(*train_groups)[1]).most_common()

        maxent.set_verbose(1)

        m = MaxentModel()

        m.begin_add_event()

        for pair in train_groups:
            m.add_event(pair[0], pair[1])

        n_cutoff = 1
        m.end_add_event(n_cutoff)

        m.train(n_itr, 'lbfgs', var, tol)

        return m
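
A minimal illustration of the train_groups structure both trainOn variants consume, a sequence of (feature list, label) pairs; the instance name and features are hypothetical:

train_groups = [
    (['contains_digit', 'short_token'], 'code'),
    (['alphabetic', 'long_token'], 'prose'),
]
model = tagger.trainOn(train_groups, n_itr=20, var=1, tol=1e-4)
print model.eval_all(['short_token'])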
Example #15
def main():
  if len(sys.argv) != 4:
    print "Usage: MaxentTrain.py features.mat labels.mat modelName"
    sys.exit(1)
  
  features = featureMatrice(sys.argv[1])
  labels = labelLst(sys.argv[2])
  
  model = MaxentModel()
  # add data into model
  model.begin_add_event()
  for i in range(len(labels)):
    model.add_event(features[i], str(labels[i]), 1)
  
  model.end_add_event()
  
  # start training
  #model.train()
  model.train(1000, "gis", 2)
  #model.train(30, "lbfgs")
  
  # save the model
  model.save(sys.argv[3])
Example #17
def main ():
    global feat_dict,me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                    metavar="FILE",
                    help="train a ME model with data from FILE")
    parser.add_option("--heldout", type = "string" , metavar="FILE", 
            help="use heldout events from FILE")
    parser.add_option("--extract", type = "string", metavar="FILE", 
            help="extract training data to file")
    parser.add_option("--events_out", type="string",
            help="write training(heldout) events to file")
    parser.add_option("-c", "--cutoff", type="int", default=10,
            help="discard feature with frequency < CUTOFF when training\
            [default=10]")
    parser.add_option("-r", "--rare", type="int", default=5, 
            help="use special feature for rare word with frequency < RARE \
            [default=5]")
    parser.add_option("-g", "--gaussian", type="float", default=0.0, 
            help="apply Gaussian penality when training \
            [default=0.0]")
    parser.add_option("-b", "--binary", action="store_true", default=0, 
            help="save events in binary format for fast loading [default=off]")
    parser.add_option("--ev_cutoff", type="int", default=1,
            help="discard event with frequency < CUTOFF when training \
            [default=1]")
    parser.add_option("--iters", type="int", default=15,
                    help="how many iterations are required for training[default=15]")

    parser.add_option("-T","--type",  type="int", default=None, 
            help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) !=1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare

    global get_context
    
    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequency information to %s' % col(word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
#    tag_dict_file = options.filename + '.tagdict'
#    print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
#    save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'), 
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file,'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass:training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise RuntimeError('not tested')
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise RuntimeError('not tested')
        print 'dumping training events to', col(options.events_out, 'lgreen')
#        import hotshot,  hotshot.stats
#        prof = hotshot.Profile("dump_events.prof", 1)
#        prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)

    me.train(options.iters, 'lbfgs', options.gaussian)
    
    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
Example #18
    return path[::-1]  # compute max_y P(y|x)


SUN = 'sun'
RAIN = 'rain'
train_data = [(SUN, 10), (SUN, 8), (SUN, 11), (RAIN, 3), (RAIN, 2), (SUN, 6), (SUN, 10), (RAIN, 1)]


labels_train = [i[0] for i in train_data]
icecream_train = [i[1] for i in train_data]


me = MaxentModel()

me.begin_add_event()

for i, data in enumerate(train_data):
    features = list(compute_features(icecream_train, i, labels_train[i - 1] if i > 0 else None))
    me.add_event(features, labels_train[i])
me.end_add_event()

me.train()


Y = set([SUN, RAIN])


print eval_model_sentence(observations=[1, 6, 1, 6], model=me)
print get_viterbi_path_memm(me=me, x=[1, 6, 1, 6], Y=Y)
class MMEMAlgorithm(object):

    # implementation of the algorithm based on HMM
    def __init__(self, compute_features, N_filter_func=N_default):
        self.filter_func = N_filter_func
        self.me = MaxentModel()
        self.num_train_iters = 2000
        self.compute_features = compute_features

    def load_memm_model(self, filename):
        self.me.load( filename  )

    def init(self):
        pass

    


    def train_model_file_list(self, corpus_filelist, ambiguity_dir ):
        self.me.begin_add_event()

        for corpus_file in corpus_filelist:
            print "Training on file {0}".format( corpus_file )
            sentence = []
            morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) )

            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func ) if os.path.exists( morph_analys_file ) else None
            if morph_analys_tokens:
                print "Using mystem features on file {0}".format( morph_analys_file )

            gold_tokens = get_tokens_from_file(corpus_file, N_filter_func = self.filter_func )
            for corpus_token in gold_tokens:

                morph_analys_token = morph_analys_tokens.next() if morph_analys_tokens else None


                gold_token_word = corpus_token[0].word
                morph_analys_token_word = morph_analys_token[0].word if morph_analys_token else None
                if morph_analys_token_word:
                    if gold_token_word != morph_analys_token_word:
                        '''
                        if ('-' in gold_token_word and '-' not in morph_analys_token_word) or ('\'' in gold_token_word and '\'' not in morph_analys_token_word):
                            morph_analys_token = morph_analys_tokens.next()
                        if ('.' in gold_token_word):
                            cnt_dots = '.'.count( gold_token_word )
                            for i in xrange( 0, cnt_dots ):
                                morph_analys_token = morph_analys_tokens.next()
                        '''
                        print >>sys.stderr, u"Start skipping sentence. Gold token wordform {0} morph token wordform {1}".format( gold_token_word, morph_analys_token_word )

                        sentence = []
                        try:
                            next_gold = gold_tokens.next()
                            while( next_gold !=  [EOS_TOKEN] ):
                                next_gold = gold_tokens.next()

                            next_gold = gold_tokens.next()
                            next_morph = morph_analys_tokens.next()
                            while( next_morph[0].word != next_gold[0].word ):
                                next_morph = morph_analys_tokens.next()

                        except StopIteration:
                            break



                if corpus_token[0] == EOS_TOKEN and len(sentence) > 0:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i,token_info in enumerate( sentence ):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]] if token_info[1] and morph_analys_token else None

                        if token_info[1] is not None:
                            if gold_token.word != token_info[1][0].word:
                                print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0}     morph analysis token : {1}".format( gold_token.word, token_info[1][0].word )
                                morph_analysises = None

                        word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) )
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram )
                    sentence = []
                else:
                    sentence.append( (corpus_token[0], morph_analys_token)  )


        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( self.num_train_iters, 'lbfgs', 0.0 )
        maxent.set_verbose(0)

    def train_model(self, corpus_dir, ambiguity_dir ):
        self.me.begin_add_event()
        #self.B = train_B_corpus(corpus_dir = corpus_dir,N_filter_func = N_filter_func)
        sentence = []

        corpus_files = get_corpus_files(corpus_dir)
        for corpus_file in corpus_files:

            morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) )
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func )

            for corpus_token in get_tokens_from_file(corpus_file, N_filter_func = self.filter_func ):

                morph_analys_token = morph_analys_tokens.next()
                if corpus_token[0] == EOS_TOKEN:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i,token_info in enumerate( sentence ):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]]
                        if gold_token.word != token_info[1][0].word:
                            print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0}     morph analysis token : {1}".format( gold_token.word, token_info[1][0].word )
                            morph_analysises = None
                        word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) )
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram )
                    sentence = []
                else:
                    sentence.append( (corpus_token[0], morph_analys_token)  )

        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( 50, 'lbfgs', 0.0 )
        maxent.set_verbose(0)

    def load_model(self, memm_filename):
        self.me.load( memm_filename )

    def save_model(self, memm_filename):
        self.me.save( memm_filename )
        #dump_object( B_stat_filename, self.B )

    def remove_ambiguity_file(self, file, outfile):
        out_f =  codecs.open( outfile, 'w', 'utf-8' )
        sentence = []
        for token in get_tokens_from_file(file, N_filter_func= self.filter_func):
            if len(token) == 1 and token[0] == EOS_TOKEN:
                if len(sentence)>0:
                    no_ambig_tokens = self.remove_ambiguity( sentence )
                    for no_ambig_token in no_ambig_tokens:
                        out_f.write( u"{0}\t{1}={2}\r\n".format(no_ambig_token[0], 'nolemma', no_ambig_token[1] ) )
                    out_f.write('\r\n')
                    sentence = []
                    continue
                else:
                    sentence = []
                    continue

            sentence.append( (token[0].word, token) )
        out_f.close()

    def remove_ambiguity_dir(self, dir):
        pass

    def remove_ambiguity(self, variants):
        """
        Structure of variants: [ (word_form, [tokens]), (...), (  ) ]
        """
        words = [variant[0]  for variant in variants]
        analysises = [[token.gram for token in variant[1]]  for variant in variants ]
        viterbi_layers = [ None for i in xrange(len(words)) ]

        viterbi_backpointers = [ None for i in xrange(len(words) + 1) ]

        # Compute first layer directly.
        viterbi_layers[0] = self.me.eval_all(list(self.compute_features(sentence=words, i = 0 , prev_label= None, analysises = analysises[0], labels = None ) ) )

        filtered_viterbi_layer = dict( (k, v) for k, v in viterbi_layers[0] if k in analysises[0] )
        viterbi_layer_0_prob = sum( [v for v in filtered_viterbi_layer.values() ]  )
        viterbi_layers[0] = dict( (k, math.log(v/viterbi_layer_0_prob) ) for k, v in filtered_viterbi_layer.items() )


        viterbi_backpointers[0] = dict( (k, None) for k, v in viterbi_layers[0].iteritems() )

        # Compute intermediate layers.
        for i in xrange(1, len(words)):
            viterbi_layers[i] = defaultdict(lambda: float("-inf"))
            viterbi_backpointers[i] = defaultdict(lambda: None)
            for prev_label, prev_logprob in viterbi_layers[i - 1].iteritems():
                features = self.compute_features(sentence=words,i= i, prev_label= prev_label, analysises = analysises[i], labels = None)
                features = list(features)
                distribution =  self.me.eval_all(features)
                distribution = dict( (label, prob) for label, prob in  distribution if label in analysises[i])

                distribution_sum = sum( [v for v in distribution.values() ]  )
                distribution = dict( (k, v/ distribution_sum) for k, v in distribution.items() )
                for label, prob in distribution.items():
                    logprob = math.log(prob)
                    if prev_logprob + logprob > viterbi_layers[i][label]:
                        viterbi_layers[i][label] = prev_logprob + logprob
                        viterbi_backpointers[i][label] = prev_label

        # Most probable endpoint.
        max_logprob = float("-inf")
        max_label = None
        for label, logprob in viterbi_layers[len(words) - 1].iteritems():
            if logprob > max_logprob:
                max_logprob = logprob
                max_label = label

        # Most probable sequence.
        path = []
        label = max_label
        for i in reversed(xrange(len(words))):
            path.insert(0, label)
            try:
                label = viterbi_backpointers[i][label]
            except KeyError:
                pass

        return zip(words,path)
Example #20
def main():
    global feat_dict, me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f",
                      "--file",
                      type="string",
                      dest="filename",
                      metavar="FILE",
                      help="train a ME model with data from FILE")
    parser.add_option("--heldout",
                      type="string",
                      metavar="FILE",
                      help="use heldout events from FILE")
    parser.add_option("--extract",
                      type="string",
                      metavar="FILE",
                      help="extract training data to file")
    parser.add_option("--events_out",
                      type="string",
                      help="write training(heldout) events to file")
    parser.add_option(
        "-c",
        "--cutoff",
        type="int",
        default=10,
        help="discard feature with frequency < CUTOFF when training\
            [default=10]")
    parser.add_option(
        "-r",
        "--rare",
        type="int",
        default=5,
        help="use special feature for rare word with frequency < RARE \
            [default=5]")
    parser.add_option("-g",
                      "--gaussian",
                      type="float",
                      default=0.0,
                      help="apply Gaussian penality when training \
            [default=0.0]")
    parser.add_option(
        "-b",
        "--binary",
        action="store_true",
        default=0,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff",
        type="int",
        default=1,
        help="discard event with frequency < CUTOFF when training \
            [default=1]")
    parser.add_option(
        "--iters",
        type="int",
        default=15,
        help="how many iterations are required for training[default=15]")

    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare

    global get_context

    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequency information to %s' % col(
        word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
    #    tag_dict_file = options.filename + '.tagdict'
    #    print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
    #    save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'),
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file, 'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass:training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise RuntimeError('not tested')
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise RuntimeError('not tested')
        print 'dumping training events to', col(options.events_out, 'lgreen')
        #        import hotshot,  hotshot.stats
        #        prof = hotshot.Profile("dump_events.prof", 1)
        #        prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)

    me.train(options.iters, 'lbfgs', options.gaussian)

    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
Example #21
def train_model(options, iterable):
    model = MaxentModel()
    data = {}
    

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()
    data["unigrams"] = dict()

    
    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, "   ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    
    unigrams = dict()
    unigrams["B-ORG"] = defaultdict(long)
    unigrams["B-MISC"] = defaultdict(long)
    unigrams["B-LOC"] = defaultdict(long)
    unigrams["B-PER"] = defaultdict(long)

    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        previous_word = "^"
        previous_label = "^"
        for word, pos, label in sentence:
            data["word_frequencies"][string.lower(word)] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word in data["labelled_words"]:
                    data["labelled_words"][string.lower(word)][label] += 1
                else:
                    data["labelled_words"][string.lower(word)] = defaultdict(long)
                    data["labelled_words"][string.lower(word)][label] = 1
            if label.startswith("B-") and (previous_word != "^"):
                unigrams[label][string.lower(previous_word)] += 1
                
            previous_label = label
            previous_word = word
    
    unigram_counters = [Counter(unigrams[key]) for key in unigrams]
    total_count = Counter()
    for counter in unigram_counters:
         total_count += counter

    total_count = dict(total_count)
    total_sum = float(sum(total_count.values()))
    inv_total_freq = dict([[key, (math.log(total_sum / total_count[key]) ** 3)] for key in total_count])
    
    for label in unigrams:
        all_sum = sum([unigrams[label][word] for word in unigrams[label]])
        uni = sorted([[(1.0 * unigrams[label][word] * inv_total_freq[word] / all_sum ), word] for word in unigrams[label]])
        uni = [word[1] for word in uni]
        data["unigrams"][label] = uni[-50:]
        # print >>sys.stderr, "*** Collected {0} unigrams for {1}".format(len(data["unigrams"][label]), label)

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
Example #22
from maxent import MaxentModel

for i in range(5):
    m = MaxentModel()
    context = []
    m.begin_add_event()
    with open('contexts/contexts' + str(i + 1) + '.txt', 'r') as f:
        for line in f:
            line = line.rstrip()
            try:
                ind = line.index(':')
                if line[:ind] != '':
                    rel = line[:ind]            # relation label before the colon
                    l = eval(line[ind + 1:])    # context features stored as a Python list literal
                    m.add_event(l, rel, 1)
            except:
                # skip lines without a colon or with an unparsable context list
                pass
    m.end_add_event()

    m.train(100, 'lbfgs')
    s_name = "models/lbfgs/model" + str(i + 1)
    m.save(s_name)
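
A hedged follow-up that reloads one of the models saved by the loop above and picks the most probable relation for an invented feature context:

loaded = MaxentModel()
loaded.load("models/lbfgs/model1")
context = ['subj_person', 'obj_city']   # invented feature names
# eval_all returns (label, probability) pairs; take the most probable relation
best = max(loaded.eval_all(context), key=lambda pair: pair[1])
print "predicted relation:", best[0], "p =", best[1]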