from maxent import MaxentModel
import maxent


def baseline(sentences_path, labels_path):
    """Train a maxent baseline on the first 3000 sentences and report
    accuracy on the remainder."""
    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()
    with open(sentences_path) as file_content:
        sentences = file_content.readlines()
    with open(labels_path) as file_content:
        labels = file_content.readlines()
    for i in xrange(0, 3000):
        m.add_event(sentences[i].split(" "), labels[i].strip())
    m.end_add_event()
    m.train()
    correct = 0
    false = 0
    for i in xrange(3000, len(sentences)):
        # P(label "1" | context), rounded to the nearest class
        result = int(round(m.eval(sentences[i].split(" "), "1")))
        label = int(labels[i])
        if result == label:
            correct += 1
        else:
            false += 1
    print "correct :", correct
    print "false :", false
    print "accuracy : {:.2f}%".format(correct * 100.0 / (correct + false))
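# A minimal follow-on sketch (assuming the toolkit's save() method, with
# hypothetical file names): persist the trained baseline so that loader
# scripts like the MaxentTest.py snippets below can reuse it via load().
def train_and_save(sentences_path, labels_path, model_path="baseline.model"):
    m = MaxentModel()
    m.begin_add_event()
    with open(sentences_path) as f_sent, open(labels_path) as f_lab:
        for sent, lab in zip(f_sent, f_lab):
            m.add_event(sent.split(" "), lab.strip())
    m.end_add_event()
    # same L-BFGS settings as the classifier below: 30 iterations,
    # Gaussian prior 2, tolerance 1e-3
    m.train(30, "lbfgs", 2, 1E-03)
    m.save(model_path)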
from maxent import MaxentModel


class MaximumEntropyClassifier(Classifier):
    """Maxent wrapper around the project's Classifier base class
    (defined elsewhere)."""

    def __init__(self, restrictFeatures=False):
        Classifier.__init__(self)
        print "MaximumEntropy: Creating model"
        self.model = MaxentModel()
        self.model.verbose = 1
        self.restrictFeatures = restrictFeatures
        self.model.begin_add_event()

    def addToIndex(self, trainingset):
        for (vec, cls) in trainingset:
            self.addFeatureVector(vec, cls)

    def addFeatureVector(self, vec, cls, value=1, binary=False):
        # Drop features outside the whitelist, if one was given; iterate
        # over a copy of the keys because we mutate the dict.
        if self.restrictFeatures:
            for key in list(vec.keys()):
                if key not in self.restrictFeatures:
                    del vec[key]
        context = vec.keys()
        label = "%s" % cls
        self.model.add_event(context, label, value)

    def compile(self):
        self.model.end_add_event()
        # 30 L-BFGS iterations, Gaussian prior 2, tolerance 1e-3
        self.model.train(30, "lbfgs", 2, 1E-03)
        #self.model.train(100, 'gis', 2)
        print "> Models trained"

    def classify(self, point, label='1', binary=False):
        result = self.model.eval(point.keys(), label)
        if result >= 0.5:
            return 1
        return -1
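# A usage sketch (hypothetical toy data): feature vectors are dicts mapping
# feature name -> value; restrictFeatures whitelists the two features kept.
clf = MaximumEntropyClassifier(restrictFeatures=set(["contains_excl",
                                                     "length_gt_10"]))
clf.addToIndex([
    ({"contains_excl": 1, "length_gt_10": 1, "ignored_feature": 1}, 1),
    ({"contains_excl": 0, "length_gt_10": 0}, -1),
])
clf.compile()
print clf.classify({"contains_excl": 1, "length_gt_10": 1})  # 1 or -1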
import sys

from maxent import MaxentModel


def main():
    if len(sys.argv) != 2:
        print "Usage: MaxentTest.py modelName"
        sys.exit(1)
    model = MaxentModel()
    model.load(sys.argv[1])
    # a 25-feature all-zero context; eval returns P("0" | context)
    context = [0] * 25
    label = model.eval(context, str(0))
    #result = model.eval_all(context)
    print "Result: ", label
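# A sketch of the commented-out alternative above (assuming the binding's
# eval_all() returns (label, probability) pairs over all outcomes):
def best_label(model, context):
    results = model.eval_all(context)  # e.g. [('0', 0.73), ('1', 0.27)]
    return max(results, key=lambda pair: pair[1])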
from maxent import MaxentModel


def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org,
                 sentences, f, op):
    # get_context() and resolve_conflicts() are project helpers defined
    # elsewhere.
    rel = ['others', 'director', 'analyst', 'advisor', 'head', 'manager',
           'spokesperson', 'founder', 'professor', 'leave', 'lawyer']
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org,
                                    best_3_name, best_5_org, sentences):
        if len(n3) == 0 or len(o5) == 0:
            op.write(str((n1, o1)) + '\n')
        else:
            # j tracks the best-scoring (name, org, relation, prob) tuple
            j = ('', '', '', 0.0)
            d = {}
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)
                    relation = ''
                    prob = 0.0
                    if context is not None:
                        # keep the relation label with the highest probability
                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        #set_r.append((name, org, relation, prob))
                        d[(name, org)] = relation
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)
                    else:
                        d[(name, org)] = 'others'
            #print str(count) + ' before : ' + str(n1) + '\t' + str(o1)
            resolve_conflicts(n1, o1, j)
            #print str(count) + ' after : ' + str(n1) + '\t' + str(o1)
            #x = raw_input()
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
            count += 1
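# An equivalent inner loop (a sketch, assuming eval_all(); not the original
# code): score all relations in one call instead of one eval() per label.
def best_relation(me, context, rel):
    scored = [(r, p) for (r, p) in me.eval_all(context) if r in rel]
    return max(scored, key=lambda pair: pair[1])  # (relation, prob)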
from collections import defaultdict

from maxent import MaxentModel

import Dictionary
import Document
import Tokenizer
import util


class BatchSBD:
    def __init__(self, dictpath):
        util.Logger.info('Initializing sbd instance...')
        self.tokenizer = Tokenizer.Tokenizer()
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0
        util.Logger.info('sbd instance initialized.')

    def load(self, modelname=None, threshold=0.0):
        util.Logger.info('Loading model...')
        assert modelname is not None
        assert modelname.strip() != ''
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed loading model '%s'" % modelname)
        except:
            raise
        util.Logger.info('Model loaded.')

    def run(self, input=None, output=None, syllable_length=1, merged_use=False):
        assert input is not None
        assert input.strip() != ''
        assert output is not None
        assert output.strip() != ''
        util.Logger.info('run ' + input + ',' + output)
        try:
            # load document
            util.Logger.info("Started to load document.")
            document = Document.Document()
            ifile = open(input)
            # build document
            util.Logger.info("Adding tokens to document.")
            self.tokenizer.clear()
            for token in self.tokenizer.tokenize(ifile):
                document.add(token)
            ifile.close()
            # detect sentence boundaries
            util.Logger.info("Detecting sentence boundaries.")
            ofile = open(output, "w+")
            line = ''
            lineno = 1
            for id in range(document.length()):
                prev = document.prev(id)
                curr = document.token(id)
                next = document.next(id)
                # check every position
                eos = self.eval(document, id, prev, curr, next,
                                syllable_length, merged_use)
                if eos is None:
                    continue  # null field found
                line += curr.value
                if curr.isEoe():
                    line += ' '
                if eos and len(line.strip()) > 0:
                    if line[0:1] == ' ':
                        ofile.write('\n')
                    ofile.write(line.strip() + '\n')
                    line = ''
            ofile.write(line.strip() + '\n')
            ofile.close()
            document.clear()
            util.Logger.info("Detecting '%s' document completed." % input)
        except:
            raise

    def eos(self, context):
        # True if P(end-of-sentence | context) clears the threshold
        prob = self.model.eval(context, 'yes')
        return prob >= self.threshold

    # append a numbered feature to the context list
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken,
             syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0:
                prefixName = default * syllable_length
            else:
                prefixName = prevToken.syllable(-1 * (length + 1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0:
                suffixName = default * syllable_length
            else:
                suffixName = nextToken.syllable(length + 1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0:
            current_token_name = default
        else:
            current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0:
            prefix_token_name = default
        else:
            prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0:
            suffix_token_name = default
        else:
            suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        return self.eos(context)

    def calc(self, answer, rule):
        # confusion-matrix bookkeeping: `answer` is the gold label,
        # `rule` is the prediction
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'FN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'TN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        fn = self.statistics['FN']
        fp = self.statistics['FP']
        util.Logger.info("tp: %d fn: %d fp: %d" % (tp, fn, fp))
        if (tp + fp) > 0:
            precision = tp * 1.0 / (tp + fp)
        if (tp + fn) > 0:
            recall = tp * 1.0 / (tp + fn)
        if (precision + recall) > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
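# A usage sketch (hypothetical paths): load a trained boundary model and
# split one raw-text file into one sentence per line.
if __name__ == '__main__':
    sbd = BatchSBD('./dict')
    sbd.load(modelname='./models/sbd.model', threshold=0.5)
    sbd.run(input='input.txt', output='sentences.txt',
            syllable_length=2, merged_use=True)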
from collections import defaultdict

from maxent import MaxentModel

import Dictionary
import Document
import Tokenizer
import util


class MaxentBasedSBD:
    def __init__(self, dictpath):
        self.tokenizer = Tokenizer.Tokenizer()
        self.documents = defaultdict(Document.Document)
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0

    def set(self, modelname=None, threshold=0.0, filename=None):
        assert modelname is not None
        assert modelname.strip() != ''
        assert filename is not None
        assert filename.strip() != ''
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed loading model '%s'" % modelname)
        except:
            raise
        try:
            util.Logger.debug("Started to load document...")
            document = Document.Document()
            infile = open(filename)
            for token in self.tokenizer.tokenize(infile):
                document.add(token)
            infile.close()
            self.documents[filename] = document
            util.Logger.debug("Completed loading document '%s'" % filename)
        except:
            raise

    def get(self, filename=None):
        assert filename is not None
        assert filename.strip() != ''
        if filename in self.documents:
            return self.documents[filename]
        else:
            return Document.Document()

    def eos(self, context):
        # True if P(end-of-sentence | context) clears the threshold
        prob = self.model.eval(context, 'yes')
        return prob >= self.threshold

    # append a numbered feature to the context list
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken,
             syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0:
                prefixName = default * syllable_length
            else:
                prefixName = prevToken.syllable(-1 * (length + 1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0:
                suffixName = default * syllable_length
            else:
                suffixName = nextToken.syllable(length + 1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0:
            current_token_name = default
        else:
            current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0:
            prefix_token_name = default
        else:
            prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0:
            suffix_token_name = default
        else:
            suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        return self.eos(context)

    def calc(self, answer, rule):
        # confusion-matrix bookkeeping: `answer` is the gold label,
        # `rule` is the prediction
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'FN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'TN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        fn = self.statistics['FN']
        fp = self.statistics['FP']
        util.Logger.info("tp: %d fn: %d fp: %d" % (tp, fn, fp))
        if (tp + fp) > 0:
            precision = tp * 1.0 / (tp + fp)
        if (tp + fn) > 0:
            recall = tp * 1.0 / (tp + fn)
        if (precision + recall) > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
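# A usage sketch (hypothetical paths, assuming the Document API used above:
# length()/prev()/token()/next() and gold end_of_sentence flags on tokens):
# load a model plus one document, score every position, and print the summary.
sbd = MaxentBasedSBD('./dict')
sbd.set(modelname='./models/sbd.model', threshold=0.5, filename='input.txt')
doc = sbd.get('input.txt')
for id in range(doc.length()):
    predicted = sbd.eval(doc, id, doc.prev(id), doc.token(id), doc.next(id),
                         syllable_length=2, merged_use=True)
    sbd.calc(doc.token(id).end_of_sentence, predicted)
sbd.summary()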