def eval_model(options, iterable):
    model = MaxentModel()
    data = {}
    # ne_labels = eval_ne_binary_model(options, iterable)
    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)
    print >>sys.stderr, "*** Evaluating..."
    data["sentences"] = iterable
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        # data["ne_labels"] = ne_labels[n]
        data["sentence_number"] = n
        data["double_quotes"] = False
        data["single_quotes"] = False
        labels = eval_model_sentence(options, data, model, words, poses)
        for word, pos, label in zip(words, poses, labels):
            print label
        print
def tag_test(test_feature_file, trained_model_file, tag_test_set_file):
    fin = codecs.open(test_feature_file, 'r', 'utf-8')
    fout = codecs.open(tag_test_set_file, 'w', 'utf-8')
    m = MaxentModel()
    m.load(trained_model_file)
    contents = fin.read()
    feature_list = contents.split('\r')
    # drop the (first) bare-newline entry left over from the split
    feature_list.remove('\n')
    for feature in feature_list:
        if feature == 'split':
            fout.write('\n\n\n')
            continue
        str_feature = []
        u_feature = feature.split(' ')
        for item in u_feature:
            str_feature.append(item.encode('utf-8'))
        label_prob_list = m.eval_all(str_feature)
        label = max_prob(label_prob_list)
        try:
            new_tag = str_feature[2].split('=')[1] + '/' + label
        except IndexError:
            # malformed feature line: report and skip it, otherwise new_tag
            # would be unbound (or stale) when written below
            print str_feature
            continue
        fout.write(new_tag.decode('utf-8'))
        pre_tag = label
    return feature_list
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}
    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)
    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)
        # Post-processing: repair illegal BIO sequences such as "O I-ORG O"
        # by promoting a stray I- tag to B- when no entity is open.
        previous_label = '^'
        for i in xrange(0, len(words)):
            label = labels[i]
            if label.startswith('I-') and previous_label in ('O', '^'):
                label = 'B' + label[1:]
            # if (i + 1 < len(words)) and (labels[i + 1] != 'O') and (labels[i] != 'O') and (labels[i + 1][0] != 'B') and (labels[i + 1][2:] != labels[i][2:]):
            #     label = labels[i][:1] + labels[i + 1][2:]
            print label
            previous_label = label
        print
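A self-contained sketch of the BIO repair step used above, with a worked example; `repair_bio` is an illustrative name, not part of the original code:

# Promote a dangling I- tag to B- when no entity is open, mirroring the
# loop in eval_model above: O I-ORG I-ORG O  ->  O B-ORG I-ORG O
def repair_bio(labels):
    fixed, previous = [], '^'
    for label in labels:
        if label.startswith('I-') and previous in ('O', '^'):
            label = 'B' + label[1:]
        fixed.append(label)
        previous = label
    return fixed

# repair_bio(['O', 'I-ORG', 'I-ORG', 'O']) == ['O', 'B-ORG', 'I-ORG', 'O']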
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-i", "--input", type="string",
                      help="test data as input")
    parser.add_option("-o", "--output", type="string",
                      help="write detector result to OUTPUT")
    parser.add_option("-m", "--model", type="string",
                      help="load trained model from MODEL")
    (options, args) = parser.parse_args()
    global m
    model = options.model
    m = MaxentModel()
    m.load(model)
    # in_file = sys.stdin
    if options.input:
        in_file = open(options.input)
    else:
        print >>sys.stderr, 'no input test data given'
        sys.exit(1)
    if len(args) >= 1:
        tag_in_file = open(args[0])
    out_file = sys.stdout
    if options.output:
        out_file = open(options.output, 'w')
    predict_file(in_file, out_file)
def main():
    if len(sys.argv) != 2:
        print "Usage: MaxentTest.py modelName"
        sys.exit(1)
    model = MaxentModel()
    model.load(sys.argv[1])
    context = [0] * 25
    # eval(context, label) returns the probability of the given label,
    # not a label string, so `label` here is really a probability.
    label = model.eval(context, str(0))
    # result = model.eval_all(context)
    print "Result: ", label
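The `eval(context, label)` call above scores one fixed label; a minimal sketch (reusing `eval_all`, which the other snippets in this collection also rely on) of picking the most probable label instead:

# Sketch: eval_all returns (label, probability) pairs; take the argmax.
def best_label(model, context):
    return max(model.eval_all(context), key=lambda pair: pair[1])[0]

# e.g. print "Best label:", best_label(model, context)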
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o", "--output", type="string",
                      help="write tagged result to OUTPUT")
    parser.add_option("-m", "--model", type="string",
                      help="load trained model from MODEL")
    parser.add_option("-t", "--test", action="store_true", default=0,
                      help="test mode, include original tag in output")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=1)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose")
    parser.add_option("-T", "--type", type="int", default=None,
                      help="choose context type")
    (options, args) = parser.parse_args()
    if not options.model:
        print >>sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)
    model = options.model
    tag_dict = cPickle.load(open(model + '.tagdict'))
    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)
    tag_in_file = sys.stdin
    if len(args) >= 1:
        tag_in_file = open(args[0])
    tag_out_file = sys.stdout
    if options.output:
        # was open(out, 'w'); `out` was undefined -- use the parsed option
        tag_out_file = open(options.output, 'w')
    tag_file(tagger, tag_in_file, tag_out_file, options.test)
def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org, sentences, f, op):
    rel = ['others', 'director', 'analyst', 'advisor', 'head', 'manager',
           'spokesperson', 'founder', 'professor', 'leave', 'lawyer']
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org, best_3_name, best_5_org, sentences):
        if len(n3) == 0 or len(o5) == 0:
            op.write(str((n1, o1)) + '\n')
        else:
            j = ('', '', '', 0.0)
            d = {}
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)
                    relation = ''
                    prob = 0.0
                    if context is not None:
                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        # set_r.append((name, org, relation, prob))
                        d[(name, org)] = relation
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)
                    else:
                        d[(name, org)] = 'others'
            # print str(count) + ' before : ' + str(n1) + '\t' + str(o1)
            resolve_conflicts(n1, o1, j)
            # print str(count) + ' after : ' + str(n1) + '\t' + str(o1)
            # x = raw_input()
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
            count = count + 1
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}
    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)
    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)
        for word, pos, label in zip(words, poses, labels):
            print label
        print
def eval_ne_binary_model(options, iterable):
    model = MaxentModel()
    data = {}
    predicted_labels = []
    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".ne.binary.maxent")
    with open(options.model + ".ne.binary.data", "r") as handle:
        data = cPickle.load(handle)
    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_ne_binary_model_sentence(options, data, model, words, poses)
        predicted_labels += [labels]
    return predicted_labels
class MMEMAlgorithm(object):
    # MEMM-based implementation of the algorithm (comment translated from Russian)
    def __init__(self, compute_features, N_filter_func=N_default):
        self.filter_func = N_filter_func
        self.me = MaxentModel()
        self.num_train_iters = 2000
        self.compute_features = compute_features

    def load_memm_model(self, filename):
        self.me.load(filename)

    def init(self):
        pass

    def train_model_file_list(self, corpus_filelist, ambiguity_dir):
        self.me.begin_add_event()
        for corpus_file in corpus_filelist:
            print "Training on file {0}".format(corpus_file)
            sentence = []
            morph_analys_file = os.path.join(ambiguity_dir, os.path.basename(corpus_file))
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func=self.filter_func) \
                if os.path.exists(morph_analys_file) else None
            if morph_analys_tokens:
                print "Using mystem features on file {0}".format(morph_analys_file)
            gold_tokens = get_tokens_from_file(corpus_file, N_filter_func=self.filter_func)
            for corpus_token in gold_tokens:
                morph_analys_token = morph_analys_tokens.next() if morph_analys_tokens else None
                gold_token_word = corpus_token[0].word
                morph_analys_token_word = morph_analys_token[0].word if morph_analys_token else None
                if morph_analys_token_word:
                    if gold_token_word != morph_analys_token_word:
                        '''
                        if ('-' in gold_token_word and '-' not in morph_analys_token_word) or ('\'' in gold_token_word and '\'' not in morph_analys_token_word):
                            morph_analys_token = morph_analys_tokens.next()
                        if ('.' in gold_token_word):
                            cnt_dots = '.'.count(gold_token_word)
                            for i in xrange(0, cnt_dots):
                                morph_analys_token = morph_analys_tokens.next()
                        '''
                        print >>sys.stderr, u"Start skipping sentence. Gold token wordform {0} morph token wordform {1}".format(gold_token_word, morph_analys_token_word)
                        sentence = []
                        # skip to the end of the gold sentence, then realign the morph stream
                        try:
                            next_gold = gold_tokens.next()
                            while next_gold != [EOS_TOKEN]:
                                next_gold = gold_tokens.next()
                            next_gold = gold_tokens.next()
                            next_morph = morph_analys_tokens.next()
                            while next_morph[0].word != next_gold[0].word:
                                next_morph = morph_analys_tokens.next()
                        except StopIteration:
                            break
                if corpus_token[0] == EOS_TOKEN and len(sentence) > 0:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i, token_info in enumerate(sentence):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]] if token_info[1] and morph_analys_token else None
                        if token_info[1] is not None:
                            if gold_token.word != token_info[1][0].word:
                                print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0} morph analysis token : {1}".format(gold_token.word, token_info[1][0].word)
                                morph_analysises = None
                        word_features = list(self.compute_features(
                            sentence=words, i=i,
                            prev_label=labels[i - 1] if i > 0 else None,
                            analysises=morph_analysises, labels=labels))
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram)
                    sentence = []
                else:
                    sentence.append((corpus_token[0], morph_analys_token))
        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train(self.num_train_iters, 'lbfgs', 0.0)
        maxent.set_verbose(0)

    def train_model(self, corpus_dir, ambiguity_dir):
        self.me.begin_add_event()
        # self.B = train_B_corpus(corpus_dir=corpus_dir, N_filter_func=N_filter_func)
        sentence = []
        corpus_files = get_corpus_files(corpus_dir)
        for corpus_file in corpus_files:
            morph_analys_file = os.path.join(ambiguity_dir, os.path.basename(corpus_file))
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func=self.filter_func)
            for corpus_token in get_tokens_from_file(corpus_file, N_filter_func=self.filter_func):
                morph_analys_token = morph_analys_tokens.next()
                if corpus_token[0] == EOS_TOKEN:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i, token_info in enumerate(sentence):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]]
                        if gold_token.word != token_info[1][0].word:
                            print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0} morph analysis token : {1}".format(gold_token.word, token_info[1][0].word)
                            morph_analysises = None
                        word_features = list(self.compute_features(
                            sentence=words, i=i,
                            prev_label=labels[i - 1] if i > 0 else None,
                            analysises=morph_analysises, labels=labels))
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram)
                    sentence = []
                else:
                    sentence.append((corpus_token[0], morph_analys_token))
        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train(50, 'lbfgs', 0.0)
        maxent.set_verbose(0)

    def load_model(self, memm_filename):
        self.me.load(memm_filename)

    def save_model(self, memm_filename):
        self.me.save(memm_filename)
        # dump_object(B_stat_filename, self.B)

    def remove_ambiguity_file(self, file, outfile):
        out_f = codecs.open(outfile, 'w', 'utf-8')
        sentence = []
        for token in get_tokens_from_file(file, N_filter_func=self.filter_func):
            if len(token) == 1 and token[0] == EOS_TOKEN:
                if len(sentence) > 0:
                    no_ambig_tokens = self.remove_ambiguity(sentence)
                    for no_ambig_token in no_ambig_tokens:
                        out_f.write(u"{0}\t{1}={2}\r\n".format(no_ambig_token[0], 'nolemma', no_ambig_token[1]))
                    out_f.write('\r\n')
                sentence = []
                continue
            sentence.append((token[0].word, token))
        out_f.close()

    def remove_ambiguity_dir(self, dir):
        pass

    def remove_ambiguity(self, variants):
        """
        Expects variants = [(word_form, [tokens]), ...]
        (docstring translated from Russian)
        """
        words = [variant[0] for variant in variants]
        analysises = [[token.gram for token in variant[1]] for variant in variants]
        viterbi_layers = [None for i in xrange(len(words))]
        viterbi_backpointers = [None for i in xrange(len(words) + 1)]
        # Compute first layer directly.
        viterbi_layers[0] = self.me.eval_all(list(self.compute_features(
            sentence=words, i=0, prev_label=None, analysises=analysises[0], labels=None)))
        filtered_viterbi_layer = dict((k, v) for k, v in viterbi_layers[0] if k in analysises[0])
        viterbi_layer_0_prob = sum([v for v in filtered_viterbi_layer.values()])
        viterbi_layers[0] = dict((k, math.log(v / viterbi_layer_0_prob)) for k, v in filtered_viterbi_layer.items())
        viterbi_backpointers[0] = dict((k, None) for k, v in viterbi_layers[0].iteritems())
        # Compute intermediate layers.
        for i in xrange(1, len(words)):
            viterbi_layers[i] = defaultdict(lambda: float("-inf"))
            viterbi_backpointers[i] = defaultdict(lambda: None)
            for prev_label, prev_logprob in viterbi_layers[i - 1].iteritems():
                features = self.compute_features(sentence=words, i=i, prev_label=prev_label,
                                                 analysises=analysises[i], labels=None)
                features = list(features)
                distribution = self.me.eval_all(features)
                # keep only labels licensed by the morphological analyses, then renormalize
                distribution = dict((label, prob) for label, prob in distribution if label in analysises[i])
                distribution_sum = sum([v for v in distribution.values()])
                distribution = dict((k, v / distribution_sum) for k, v in distribution.items())
                for label, prob in distribution.items():
                    logprob = math.log(prob)
                    if prev_logprob + logprob > viterbi_layers[i][label]:
                        viterbi_layers[i][label] = prev_logprob + logprob
                        viterbi_backpointers[i][label] = prev_label
        # Most probable endpoint.
        max_logprob = float("-inf")
        max_label = None
        for label, logprob in viterbi_layers[len(words) - 1].iteritems():
            if logprob > max_logprob:
                max_logprob = logprob
                max_label = label
        # Most probable sequence.
        path = []
        label = max_label
        for i in reversed(xrange(len(words))):
            path.insert(0, label)
            try:
                label = viterbi_backpointers[i][label]
            except KeyError:
                pass
        return zip(words, path)
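A minimal driver sketch for the class above; `my_compute_features` and the file paths are hypothetical stand-ins, and the token helpers (`get_tokens_from_file`, `EOS_TOKEN`) are assumed to come from this module:

# Hypothetical usage: train a MEMM disambiguator, save it, decode one file.
algo = MMEMAlgorithm(compute_features=my_compute_features)  # my_compute_features: stand-in feature extractor
algo.train_model('corpus/', 'ambiguity/')   # or train_model_file_list([...], 'ambiguity/')
algo.save_model('memm.model')
algo.remove_ambiguity_file('ambiguity/text1.txt', 'resolved/text1.txt')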
class BatchSBD:
    def __init__(self, dictpath):
        util.Logger.info('Initializing sbd instance...')
        self.tokenizer = Tokenizer.Tokenizer()
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0
        util.Logger.info('sbd instance initialized.')

    def load(self, modelname=None, threshold=0.0):
        util.Logger.info('Loading model...')
        assert modelname is not None
        assert modelname.strip() != ''
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed to load model '%s'" % modelname)
        except:
            raise
        util.Logger.info('Model loaded.')

    def run(self, input=None, output=None, syllable_length=1, merged_use=False):
        util.Logger.info('run ' + input + ',' + output)
        assert input is not None
        assert input.strip() != ''
        assert output is not None
        assert output.strip() != ''
        try:
            # load document
            util.Logger.info("Started to load document.")
            document = Document.Document()
            ifile = open(input)
            # build document
            util.Logger.info("Adding token to document.")
            self.tokenizer.clear()
            for token in self.tokenizer.tokenize(ifile):
                document.add(token)
            ifile.close()
            # detect sentence boundaries
            util.Logger.info("Detecting sentence boundaries.")
            ofile = open(output, "w+")
            line = ''
            lineno = 1
            for id in range(document.length()):
                prev = document.prev(id)
                curr = document.token(id)
                next = document.next(id)
                eos = False
                # check every position
                eos = self.eval(document, id, prev, curr, next, syllable_length, merged_use)
                if eos is None:
                    continue  # null field found
                line += curr.value
                if curr.isEoe():
                    line += ' '
                if eos and len(line.strip()) > 0:
                    if line[0:1] == ' ':
                        ofile.write('\n')
                    ofile.write(line.strip() + '\n')
                    line = ''
            ofile.write(line.strip() + '\n')
            ofile.close()
            document.clear()
            util.Logger.info("Detecting '%s' document completed." % input)
        except:
            raise

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        if prob >= self.threshold:
            return True
        else:
            return False

    # append property into list-buf
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0:
                prefixName = default * syllable_length
            else:
                prefixName = prevToken.syllable(-1 * (length + 1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0:
                suffixName = default * syllable_length
            else:
                suffixName = nextToken.syllable(length + 1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0:
            current_token_name = default
        else:
            current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0:
            prefix_token_name = default
        else:
            prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0:
            suffix_token_name = default
        else:
            suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        eos = self.eos(context)
        return eos

    def calc(self, answer, rule):
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'TN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'FN'
        self.statistics[result] += 1

    def summary(self):
        # NB: the 'TN'/'FP' labels assigned in calc() look swapped relative to
        # the usual confusion-matrix terms; the formulas below follow the
        # labels as the original author wrote them.
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        tn = self.statistics['TN']
        fp = self.statistics['FP']
        util.Logger.info("tp:", tp, "tn:", tn, "fp:", fp)
        if (tp + tn) > 0:
            precision = tp * 1.0 / (tp + tn)
        if (tp + fp) > 0:
            recall = tp * 1.0 / (tp + fp)
        if (precision + recall) > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
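A short usage sketch for BatchSBD; the paths, model name, and threshold below are illustrative assumptions, not values from the original source:

# Hypothetical usage: load dictionaries and a trained model, then segment a file.
sbd = BatchSBD('dict/')
sbd.load(modelname='sbd.model', threshold=0.5)
sbd.run(input='raw.txt', output='sentences.txt', syllable_length=2, merged_use=True)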
class MaxentBasedSBD:
    def __init__(self, dictpath):
        self.tokenizer = Tokenizer.Tokenizer()
        self.documents = defaultdict(Document.Document)
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0

    def set(self, modelname=None, threshold=0.0, filename=None):
        assert modelname is not None
        assert modelname.strip() != ''
        assert filename is not None
        assert filename.strip() != ''
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed to load model '%s'" % modelname)
        except:
            raise
        try:
            util.Logger.debug("Started to load document...")
            document = Document.Document()
            file = open(filename)
            for token in self.tokenizer.tokenize(file):
                document.add(token)
            file.close()
            self.documents[filename] = document
            util.Logger.debug("Completed to load document '%s'" % filename)
        except:
            raise

    def get(self, filename=None):
        assert filename is not None
        assert filename.strip() != ''
        if filename in self.documents:
            return self.documents[filename]
        else:
            return Document.Document()

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        if prob >= self.threshold:
            return True
        else:
            return False

    # append property into list-buf
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0:
                prefixName = default * syllable_length
            else:
                prefixName = prevToken.syllable(-1 * (length + 1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0:
                suffixName = default * syllable_length
            else:
                suffixName = nextToken.syllable(length + 1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0:
            current_token_name = default
        else:
            current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0:
            prefix_token_name = default
        else:
            prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0:
            suffix_token_name = default
        else:
            suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        eos = self.eos(context)
        return eos

    def calc(self, answer, rule):
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'TN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'FN'
        self.statistics[result] += 1

    def summary(self):
        # NB: as in BatchSBD, the 'TN'/'FP' labels look swapped relative to the
        # usual confusion-matrix terms; the formulas follow the labels as-is.
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        tn = self.statistics['TN']
        fp = self.statistics['FP']
        util.Logger.info("tp:", tp, "tn:", tn, "fp:", fp)
        if (tp + tn) > 0:
            precision = tp * 1.0 / (tp + tn)
        if (tp + fp) > 0:
            recall = tp * 1.0 / (tp + fp)
        if (precision + recall) > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Imports
import sys, os

# Load MaxEnt models
corpusPath = os.environ.get('CORPUS_PATH')
from maxent import MaxentModel
maxEntModel = MaxentModel()
maxEntModel.load(corpusPath + '/model_markers.txt')

for trainLine in sys.stdin.readlines():
    # strip the trailing newline so the last feature column stays clean
    trainCols = trainLine.rstrip('\n').split('\t')
    modelMarkerProbas = maxEntModel.eval_all(trainCols[1:])
    probaFeats = []
    for modelMarkerProba in modelMarkerProbas:
        if modelMarkerProba[1] > 0.00001:
            probaFeats.append(modelMarkerProba[0] + ':' + str(modelMarkerProba[1]))
    print trainCols[0] + '\t' + '\t'.join(probaFeats)
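The filter above reads tab-separated lines (a label first, feature columns after) from stdin and emits the label followed by marker:probability pairs. An illustration of the expected shapes; the feature names and probabilities below are made up:

# Illustrative I/O (hypothetical values):
#   stdin:  NONE<TAB>w=hello<TAB>pos=UH
#   stdout: NONE<TAB>markerA:0.91<TAB>markerB:0.07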
# Fragment: the enclosing segmenter class and the start of this method are
# missing from the source; the visible part finishes a tag -> handler table.
            'b': self._dummy,
            'm': self._dummy}
        return action[tag]

    def segment(self):
        """
        sent must be utf8 decoded.
        """
        if not self.sentence:
            return ''
        ts = heappop(self._segmentationTag()).segtags
        return ' '.join(self._get_words(ts))

    _newword = lambda self, result, nextword: (result + [nextword], '')
    _dummy = lambda self, result, nextword: (result, nextword)


def test_tagger(inputfile, model, segmenterClass):
    for line in open(inputfile):
        line = line.decode('utf8').strip()
        segmenter = segmenterClass(line, model)
        print segmenter.segment().encode('utf8')


if __name__ == "__main__":
    import sys
    if len(sys.argv) != 3:
        print 'Usage: segmenter.py modelfile inputfile'
        sys.exit()
    from maxent import MaxentModel
    model = MaxentModel()
    model.load(sys.argv[1])
    test_tagger(sys.argv[2], model, segmenterClass=WordSegmenter)
#!/usr/bin/python2.5
# Load a trained model and re-save it under '<model>.txt'
# (presumably as a text dump of the binary model).
from maxent import MaxentModel
import sys

model_file = sys.argv[1]
m = MaxentModel()
m.load(model_file)
m.save(model_file + '.txt')