class TextPredictor(object):
    def __init__(self, model_path):
        self.identifer = libtrate.DoubleIdentifer()
        self.identifer.Load(model_path + '/identifer.bin')
        self.predictor = Predictor()
        self.predictor.load(model_path)
        self.normalizer = libnormalizer.NormalizerFactory().Load(model_path + '/normalizer.bin')
        from libcalibrator import CalibratorFactory
        self.calibrator = CalibratorFactory.Load(model_path + '/calibrator.bin')
        Segmentor.Init()

    def deal_title(self, title):
        return title + ' ' + libgezi.normalize_str(title)

    def deal_content(self, content):
        # strip html, keep at most the first 100 bytes (GBK-safe substring),
        # then append a normalized copy of the text
        content = libgezi.strip_html(content)
        if len(content) > 100:
            content = libgezi.gbk_substr(content, 0, 100)
        content = content + ' ' + libgezi.normalize_str(content)
        return content

    def predict(self, title, content):
        Segmentor.Init()
        title = self.deal_title(title)
        content = self.deal_content(content)
        # word segmentation
        title_words = Segmentor.Segment(title, SEG_BASIC)
        content_words = Segmentor.Segment(content, SEG_BASIC)
        # build the sparse ngram feature vector; content feature ids are offset
        # by the vocabulary size so they do not collide with title feature ids
        id_val_map = libtrate.id_map()
        num_words = self.identifer.size()
        libtrate.TextPredictor.Prase(title_words, id_val_map, self.identifer, 0, ngram=3, skip=2)
        libtrate.TextPredictor.Prase(content_words, id_val_map, self.identifer, num_words, ngram=3, skip=2)
        fe = libtrate.Vector(id_val_map)
        # normalize, score, then map the raw score to a calibrated probability
        normed_fe = self.normalizer.NormalizeCopy(fe)
        score = float(self.predictor.Predict(normed_fe))
        adjusted_score = self.calibrator.PredictProbability(score)
        return adjusted_score
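# A minimal usage sketch for the class above. The model directory and the
# sample title/content strings are illustrative assumptions, not files or data
# shipped with this code.
if __name__ == '__main__':
    predictor = TextPredictor('./model')
    prob = predictor.predict('example title', '<p>example content</p>')
    print('spam probability: {}'.format(prob))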
def main():
    for solver in SOLVERS:
        for loss_type in LOSS_FUNCS:
            for h in HIDDEN_SIZES:
                print('Hidden sizes = {}'.format(h))
                hidden_sizes = h
                for lr in LEARNING_RATES:
                    ds = DataSampler()
                    arch = MODEL + '_' + 'x'.join([str(i) for i in hidden_sizes]) + 'x{}'.format(lr)
                    task = '_'.join([arch, loss_type, solver])
                    print('[TRAIN] Start experiment: {}'.format(task))
                    classifier = BinaryClassifier(data_sampler=ds,
                                                  task_name=task,
                                                  hidden_sizes=hidden_sizes,
                                                  model=MODEL,
                                                  solver_type=solver,
                                                  activation='relu',
                                                  loss_func=loss_type,
                                                  learning_rate=lr)
                    if FLAGS.sanity_check:
                        classifier.overfit_test()
                    else:
                        classifier.train()
                    # reset the model
                    tf.reset_default_graph()
                    print('[TRAIN] Done {}, reset network.'.format(task))
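# Example values for the grid constants that main() iterates over. These
# particular solvers, loss functions, layer sizes and learning rates are
# assumptions for illustration, not the settings used in the original runs.
MODEL = 'mlp'
SOLVERS = ['adam', 'sgd']
LOSS_FUNCS = ['sigmoid_cross_entropy', 'hinge']
HIDDEN_SIZES = [[128, 64], [256, 128, 64]]
LEARNING_RATES = [0.01, 0.001]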
#identifer.Load('./data/ltrate.thread.model/identifer.bin')
identifer.Load('./ltrate.thread.model/identifer.bin')
print identifer.size()
print identifer.id('工程')

#normalizer = libtrate.NormalizerFactory.CreateNormalizer('minmax', './data/ltrate.thread.model/normalizer.bin')
#normalizer = libnormalizer.NormalizerFactory.Load('./data/ltrate.thread.model/normalizer.bin')
normalizer = libnormalizer.NormalizerFactory.Load('./ltrate.thread.model/normalizer.bin')

#lpredictor = libtrate.PredictorFactory.LoadPredictor('./data/ltrate.thread.model/')
lpredictor = libtrate.PredictorFactory.LoadPredictor('./ltrate.thread.model/')

from libcalibrator import CalibratorFactory
calibrator = CalibratorFactory.Load('./model/calibrator.bin')

bc = BinaryClassifier()
bc.load('./model')

print type(normalizer)

import libgezi

def deal_content(content):
    content = libgezi.strip_html(content)
    if len(content) > 100:
        content = libgezi.gbk_substr(content, 0, 100)
    content = content + ' ' + libgezi.normalize_str(content)
    return content

def deal_title(title):
    return title + ' ' + libgezi.normalize_str(title)
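# A rough end-to-end scoring sketch using the objects loaded above. It mirrors
# the feature extraction in TextPredictor.predict() and assumes Segmentor and
# SEG_BASIC are imported as in the predictor module; the sample title/content
# strings are made-up placeholders.
Segmentor.Init()
title = deal_title('example title')
content = deal_content('<p>example content</p>')
title_words = Segmentor.Segment(title, SEG_BASIC)
content_words = Segmentor.Segment(content, SEG_BASIC)
id_val_map = libtrate.id_map()
libtrate.TextPredictor.Prase(title_words, id_val_map, identifer, 0)
libtrate.TextPredictor.Prase(content_words, id_val_map, identifer, identifer.size())
fe = libtrate.Vector(id_val_map)
score = lpredictor.Predict(fe)
print 'linear score:', score
print 'calibrated probability:', calibrator.PredictProbability(score)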
#!/usr/bin/env python
#coding=gbk
# ==============================================================================
#          \file   test_bc_load.py
#        \author   chenghuige
#          \date   2015-12-29 13:47:35.353005
#   \Description
# ==============================================================================

import sys, os

from binary_classification import BinaryClassifier

bc = BinaryClassifier()
bc.load('./model')
class TextPredictor(object):
    def __init__(self, model_path, predictor_path):
        self.identifer = libtrate.DoubleIdentifer()
        self.identifer.Load(model_path + '/identifer.bin')
        #self.predictor = libtrate.LinearPredictor()
        #self.predictor.Load(predictor_path)
        self.predictor = libtrate.PredictorFactory.LoadPredictor(predictor_path)
        self.dnn_predictor = DnnPredictor()
        self.dnn_predictor.load(model_path)
        # fall back to a standalone normalizer when no linear predictor is loaded
        self.normalizer = None
        if self.predictor == None:
            self.normalizer = libnormalizer.NormalizerFactory().Load('./normalizer.bin')
        else:
            self.normalizer = self.predictor.GetNormalizer()
        from libcalibrator import CalibratorFactory
        self.calibrator = CalibratorFactory.Load(model_path + '/calibrator.bin')
        Segmentor.Init()

    def deal_title(self, title):
        return title + ' ' + libgezi.normalize_str(title)

    def deal_content(self, content):
        content = libgezi.strip_html(content)
        if len(content) > 100:
            content = libgezi.gbk_substr(content, 0, 100)
        content = content + ' ' + libgezi.normalize_str(content)
        return content

    def predict(self, title, content):
        Segmentor.Init()
        title = self.deal_title(title)
        content = self.deal_content(content)
        #return libtrate.TextPredictor.Predict(title, content, self.identifer, self.predictor)
        title_words = Segmentor.Segment(title, SEG_BASIC)
        content_words = Segmentor.Segment(content, SEG_BASIC)
        #return libtrate.TextPredictor.Predict(title_words, content_words, self.identifer, self.predictor)
        #score1 = libtrate.TextPredictor.Predict(title_words, content_words, self.identifer, self.predictor)
        # build the sparse feature vector; content feature ids are offset by the
        # vocabulary size so they do not collide with title feature ids
        id_val_map = libtrate.id_map()
        num_words = self.identifer.size()
        libtrate.TextPredictor.Prase(title_words, id_val_map, self.identifer, 0)
        libtrate.TextPredictor.Prase(content_words, id_val_map, self.identifer, num_words)
        #print id_val_map.size()
        fe = libtrate.Vector(id_val_map)
        #print fe.indices.size()
        # score with both the linear predictor and the dnn predictor, then
        # calibrate the dnn score into a probability
        score = self.predictor.Predict(fe)
        #print 'begin dnn predict'
        #fe = self.normalizer.NormalizeCopy(fe)
        dnn_score = float(self.dnn_predictor.Predict(fe))
        adjusted_dnn_score = self.calibrator.PredictProbability(dnn_score)
        #print score1, ' ', score2, ' ', score3
        #print 'linear: ', score1, ' ', 'dnn: ', score3, 'adjusted: ', self.calibrator.PredictProbability(score3)
        #return self.predictor.Predict(fe)
        return (score, dnn_score, adjusted_dnn_score)

    def id2key_map(self, id_val_map):
        # map feature ids back to readable ngram keys ('t:' for title features,
        # 'c:' for content features) together with their linear-model weights
        fe = libtrate.Vector(id_val_map)
        self.predictor.GetNormalizer().Normalize(fe)
        key_val_map = collections.defaultdict(float)
        weights = self.predictor.weights()
        total = 0
        for i in xrange(fe.indices.size()):
            index = fe.indices[i]
            val = fe.values[i]
            if weights[index] == 0:
                continue
            key = ''
            if index < self.identifer.size():
                key = 't:' + self.identifer.key(index)
            else:
                key = 'c:' + self.identifer.key(index - self.identifer.size())
            key_val_map[key] += weights[index]
            total += val * weights[index]
        total += self.predictor.bias()
        return key_val_map, total

    def predict_debug(self, title, content):
        id_val_map = libtrate.id_map()
        title = self.deal_title(title)
        content = self.deal_content(content)
        #score = libtrate.TextPredictor.Predict(title, content, self.identifer, self.predictor, id_val_map)
        title_words = Segmentor.Segment(title, SEG_BASIC)
        sep = '/'
        import gezi
        title_seg_result = sep.join(gezi.vec2list(title_words))
        content_words = Segmentor.Segment(content, SEG_BASIC)
        content_seg_result = sep.join(gezi.vec2list(content_words))
        score = libtrate.TextPredictor.Predict(title_words, content_words, self.identifer, self.predictor, id_val_map)
        key_val_map, total = self.id2key_map(id_val_map)
        import cStringIO
        map_info = cStringIO.StringIO()
        map_info.write('Per ngram weight sort by spam prob\n\n')
        sorted_items = sorted(key_val_map.items(), key=lambda x: x[1], reverse=True)
        for key, val in sorted_items:
            map_info.write('{}|{:40f}\n'.format(key.replace('\x01', '/'), val))
        map_info.write('\n\nPer ngram weight sort by importance\n\n')
        sorted_items = sorted(key_val_map.items(), key=lambda x: abs(x[1]), reverse=True)
        for key, val in sorted_items:
            map_info.write('{}|{:40f}\n'.format(key.replace('\x01', '/'), val))

        class DebugInfo(object):
            title_seg_result = ''
            content_seg_result = ''

        debug_info = DebugInfo()
        debug_info.title_seg_result = title_seg_result
        debug_info.content_seg_result = content_seg_result
        debug_info.map_info = map_info.getvalue()
        debug_info.title = title
        debug_info.content = content
        debug_info.total = total
        debug_info.output = self.predictor.Output(id_val_map)
        debug_info.score = self.predictor.Predict(id_val_map)
        return score, debug_info
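# A minimal usage sketch for predict_debug(). The model paths and the sample
# title/content below are placeholders taken from the test scripts, not values
# guaranteed to exist in a checkout.
if __name__ == '__main__':
    predictor = TextPredictor('./model', './ltrate.thread.model/')
    score, debug_info = predictor.predict_debug('example title', '<p>example content</p>')
    print 'score:', score
    print 'title segmentation:', debug_info.title_seg_result
    print 'content segmentation:', debug_info.content_seg_result
    print debug_info.map_info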