import os

from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): sklearn.externals.joblib is deprecated/removed in modern
# scikit-learn; switch to `import joblib` when upgrading the dependency.
from sklearn.externals import joblib

from preprocess import Preprocessor


def custom_preprocessor(text):
    """Identity preprocessor: the corpus is already cleaned upstream."""
    return text


def custom_tokenizer(text):
    """Split pre-segmented text on single spaces (one token per word)."""
    return text.split(' ')


# Initialize the shared transformer. The custom hooks stop CountVectorizer
# from re-tokenizing text that was already word-segmented elsewhere.
count_vect = CountVectorizer(analyzer='word',
                             tokenizer=custom_tokenizer,
                             preprocessor=custom_preprocessor)

# Load the whole corpus (0 presumably means "no size limit" — confirm
# against Preprocessor.load).
prep = Preprocessor()
prep.load(0)

# Fit the global vocabulary; each entry of texts is (paragraph_id, content).
print("[Transformer]: transform all text to global CountVectorizer")
texts = prep.get_all_text()
count_vect.fit_transform([text[1] for text in texts])

# Persist the fitted transformer for model_trainer to reuse. Create the
# output directory first — joblib.dump does not create it (bug fix).
if not os.path.isdir("core_model"):
    os.makedirs("core_model")
joblib.dump(count_vect, "core_model/count_vectorizer.model")
print("[Transformer]: saved")
class model_trainer:
    """Per-tag model-selection trainer.

    For every tag, trains several candidate classifiers on TF-IDF features
    (built on top of the shared CountVectorizer produced by
    build_text_transformer.py), keeps whichever scores the best binary F1 on
    the held-out split, persists that model to disk and records its score in
    the DB.
    """

    # A tag is skipped unless it has at least this many training samples.
    MIN_TRAIN_SAMPLES = 200

    def __init__(self):
        # The shared text transformer must exist on disk before training.
        self.model_file_dir = "core_model"
        self.count_vect_file_name = "core_model/count_vectorizer.model"
        if not os.path.isfile(self.count_vect_file_name):
            print("Build transformer first! (python build_text_transformer.py)")
            sys.exit()
        self.count_vect = joblib.load(self.count_vect_file_name)
        self.prep = Preprocessor()
        # Bagged ensembles originally intended for a soft-voting 'MAJOR'
        # model (currently disabled below).
        # NOTE(review): SGDClassifier's n_iter was renamed max_iter in newer
        # scikit-learn; keep in sync with the installed version.
        self.model_major = [
            ('SVM', BaggingClassifier(
                SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              n_iter=5, random_state=42),
                max_samples=0.5, max_features=0.5)),
            ('NB', BaggingClassifier(MultinomialNB(alpha=.01),
                                     max_samples=0.5, max_features=0.5)),
            ('RDFOREST', RandomForestClassifier(n_estimators=25)),
            ('NC', BaggingClassifier(NearestCentroid(),
                                     max_samples=0.5, max_features=0.5)),
            ('ADA-SAMME.R', AdaBoostClassifier(n_estimators=100)),
        ]
        # Candidate models compared per tag in train_all_tag().
        self.models = {
            'SVM': SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                 n_iter=5, random_state=42),
            'NB': MultinomialNB(alpha=.01),
            'RDFOREST': RandomForestClassifier(n_estimators=25),
            'NC': NearestCentroid(),
        }

    def load_data(self, size=0):
        """Load the labelled corpus (size=0 presumably loads everything —
        confirm against Preprocessor.load)."""
        self.prep.load(size)

    def train(self, X, y, count_vect, clf, partial=False):
        """Fit clf on TF-IDF features of the raw texts X.

        Bug fix: the fitted TfidfTransformer is kept on the instance so that
        predict() reuses the *training* IDF weights. The original code
        refitted TF-IDF on the test set, leaking test statistics and feeding
        the classifier features on a different scale than it was trained on.
        """
        X_count = count_vect.transform(X)
        self._tfidf = TfidfTransformer()
        X_tfidf = self._tfidf.fit_transform(X_count)
        if partial:
            clf.partial_fit(X_tfidf, y)
        else:
            clf.fit(X_tfidf, y)

    def predict(self, X, count_vect, clf):
        """Predict labels for X using the TF-IDF weights fitted in train()."""
        X_count = count_vect.transform(X)
        tfidf = getattr(self, '_tfidf', None)
        if tfidf is not None:
            X_tfidf = tfidf.transform(X_count)
        else:
            # No prior train() call: fall back to fitting on X itself
            # (the original behavior).
            X_tfidf = TfidfTransformer().fit_transform(X_count)
        return clf.predict(X_tfidf)

    def train_all_tag(self):
        """Train every candidate model for every tag, keep the best per tag
        (by binary F1), save it to disk and record its score in the DB.

        Returns a dict: tag -> (f1, precision, recall, accuracy, model_name).
        """
        train_tag_list = []
        highest_score = defaultdict(float)
        highest_info = {}
        highest_model = {}
        all_tag_idx = self.prep.get_all_tag_idx()
        for target_tag in all_tag_idx:
            # The split does not depend on the model, so fetch and check it
            # once per tag (the original refetched it for every model).
            X_train, y_train, X_test, y_test = \
                self.prep.get_train_test_data_tag(target_tag)
            if len(y_train) < self.MIN_TRAIN_SAMPLES:
                # Message fixed: the actual threshold is 200, not 100.
                print("[Train] not enough (%3d less than 200) sample for '%s'" % (
                    len(y_train),
                    self.prep.get_target_names()[target_tag].encode('utf-8')))
                continue
            # Keep only the content from each (paragraph_id, content) pair.
            X_train = [data[1] for data in X_train]
            X_test = [data[1] for data in X_test]
            train_tag_list.append(target_tag)
            print("train %s" % target_tag)
            for model_name in self.models:
                self.train(X_train, y_train, self.count_vect,
                           self.models[model_name], False)
                predicted = self.predict(X_test, self.count_vect,
                                         self.models[model_name])
                score = np.mean(predicted == y_test)
                precision, recall, f1, _ = \
                    metrics.precision_recall_fscore_support(
                        y_test, predicted, average='binary',
                        pos_label=target_tag)
                print("%s score %.2f (%s)-[%d/%d]" % (
                    model_name, score,
                    self.prep.get_target_names()[target_tag].encode('utf-8'),
                    len(y_train), len(y_test)))
                print("precision=%.2f, recall=%.2f, *** f1=%.2f ***" % (
                    precision, recall, f1))
                print("")
                if highest_score[target_tag] < f1:
                    highest_score[target_tag] = f1
                    highest_info[target_tag] = (f1, precision, recall,
                                                score, model_name)
                    # Deep-copy so later fits don't overwrite the winner.
                    highest_model[target_tag] = deepcopy(
                        self.models[model_name])
        db = DB()
        db.clear_model_score()
        for tag in set(train_tag_list):
            if tag not in highest_info:
                continue
            output_filename = os.path.join(self.model_file_dir,
                                           "%s.model" % tag)
            print("[Train] save model '%s' (%s) with F1=%.2f, precision=%.2f, recall=%.2f, score=%.2f : %s" % (
                output_filename,
                self.prep.get_target_names()[tag].encode('utf-8'),
                highest_info[tag][0], highest_info[tag][1],
                highest_info[tag][2], highest_info[tag][3],
                highest_info[tag][4]))
            joblib.dump(highest_model[tag], output_filename)
            db.add_model_info(tag, output_filename, highest_score[tag])
        return highest_info
def main(args):
    """Entry point: dispatch on args.task ('validate', 'train', 'plot',
    'dt_eval').

    Expects the argparse namespace to carry train_X/train_T (CSV paths),
    optionally test_X/test_T, save/load model paths, and task-specific
    options (dot for plotting).
    """
    model = get_model(args)
    if args.task == 'validate':
        X_Train = load_csv(args.train_X)
        T_Train = load_csv(args.train_T).flatten()
        X_Train_phi, phi = preprocess(args, X_Train, T_Train)
        logging.info('Training')
        model.validate(X_Train_phi, T_Train, params=get_param_validate(args))
    elif args.task == 'train':
        X_Train = load_csv(args.train_X)
        T_Train = load_csv(args.train_T).flatten()
        X_Train_phi, phi = preprocess(args, X_Train, T_Train)
        # Bug fix: the original shuffled `range(len(X_Train))` in place,
        # which raises under Python 3 (range objects are immutable). A
        # permutation index array shuffles features and targets in lockstep.
        inds = np.random.permutation(len(X_Train))
        X_Train_phi = X_Train_phi[inds]
        T_Train = T_Train[inds]
        logging.info('Training')
        model.train(X_Train_phi, T_Train, param=get_param(args))
        train_acc = model.eval(X_Train_phi, T_Train)
        logging.info('Training Accuracy = %f', train_acc)
        if args.test_X is not None and args.test_T is not None:
            X_Test = load_csv(args.test_X)
            T_Test = load_csv(args.test_T).flatten()
            # Reuse the preprocessor fitted on the training data.
            X_Test_phi = phi.transform(X_Test)
            test_acc = model.eval(X_Test_phi, T_Test)
            logging.info('Testing Accuracy = %f', test_acc)
            print(test_acc)
        if args.save is not None:
            model.save(args.save)
            logging.info('Model saved at %s', args.save)
            phi.save(args.save + '_phi')
            logging.info('Model preprocessor saved at %s_phi', args.save)
    elif args.task == 'plot':
        model.load(args.load)
        logging.info('Model loaded from %s', args.load)
        logging.info('Plotting')
        l_tree = model.model.estimators_
        plot_decision_tree(l_tree, args.dot)
    elif args.task == 'dt_eval':
        # Score each estimator of the ensemble individually.
        phi = Preprocessor()
        phi.load(args.load + '_phi')
        X_Test = load_csv(args.test_X)
        T_Test = load_csv(args.test_T).flatten()
        X_Test_phi = phi.transform(X_Test)
        model.load(args.load)
        logging.info('Model loaded from %s', args.load)
        logging.info('Decision Tree Evaluating')
        for tree in model.model.estimators_:
            print('%f' % tree.score(X_Test_phi, T_Test))