import os
import sys
from collections import defaultdict
from copy import deepcopy

import numpy as np
from sklearn import metrics
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier)
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid

from preprocess import Preprocessor
# NOTE(review): `DB` (used in model_trainer.train_all_tag) and main()'s helpers
# (get_model, load_csv, preprocess, get_param, get_param_validate,
# plot_decision_tree) are still undefined in this file -- confirm their home
# modules and import them here.

def custom_preprocessor(text):
    """Identity preprocessor for CountVectorizer.

    The pipeline feeds already-cleaned text in, so no normalisation is
    performed here.  (Renamed the parameter from `str`, which shadowed the
    builtin, and removed the tab/space indentation mix.)
    """
    return text
	
def custom_tokenizer(text):
    """Split pre-segmented text on single spaces.

    The input is expected to be word-segmented already, with exactly one
    space between words.  `split(' ')` is kept (rather than `split()`) so
    behaviour on consecutive spaces is unchanged: they yield empty tokens.
    (Renamed the parameter from `str`, which shadowed the builtin.)
    """
    return text.split(' ')

# intialize transformers    
count_vect = CountVectorizer(analyzer = 'word',tokenizer=custom_tokenizer,preprocessor=custom_preprocessor)

# load all text
prep = Preprocessor()
prep.load(0)

# fit transformers
print "[Transformer]: transform all text to global CountVectorizer"
texts = prep.get_all_text()
count_vect.fit_transform([text[1] for text in texts])

# export transformer
joblib.dump(count_vect, "core_model/count_vectorizer.model")
print "[Transformer]: saved"
class model_trainer:
    def __init__(self):
        # read model/transformer from file
        self.model_file_dir = "core_model"
        self.count_vect_file_name = "core_model/count_vectorizer.model"

        if not os.path.isfile(self.count_vect_file_name):
            print "Build transformer first! (python build_text_transformer.py)"
            sys.exit()

        self.count_vect = joblib.load(self.count_vect_file_name)
        self.prep = Preprocessor()

        self.model_major = [
            ('SVM',
             BaggingClassifier(SGDClassifier(loss='hinge',
                                             penalty='l2',
                                             alpha=1e-3,
                                             n_iter=5,
                                             random_state=42),
                               max_samples=0.5,
                               max_features=0.5)),
            ('NB',
             BaggingClassifier(MultinomialNB(alpha=.01),
                               max_samples=0.5,
                               max_features=0.5)),
            # ('ANN' ,  BaggingClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),max_samples=0.5, max_features=0.5)),
            # ('KNN' ,  BaggingClassifier(KNeighborsClassifier(n_neighbors=10),max_samples=0.5, max_features=0.5)),
            ('RDFOREST', RandomForestClassifier(n_estimators=25)),
            ('NC',
             BaggingClassifier(NearestCentroid(),
                               max_samples=0.5,
                               max_features=0.5)),
            ('ADA-SAMME.R', AdaBoostClassifier(n_estimators=100)),
        ]

        self.models = {
            'SVM':
            SGDClassifier(loss='hinge',
                          penalty='l2',
                          alpha=1e-3,
                          n_iter=5,
                          random_state=42),
            'NB':
            MultinomialNB(alpha=.01),
            # 'ANN' : MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),
            # 'KNN' : KNeighborsClassifier(n_neighbors=10),
            'RDFOREST':
            RandomForestClassifier(n_estimators=25),
            'NC':
            NearestCentroid(),
            # 'MAJOR' : VotingClassifier(estimators=self.model_major,voting='soft',n_jobs=-1)
        }

    def load_data(self, size=0):
        self.prep.load(size)

    def train(self, X, y, count_vect, clf, partial=False):
        X_count = count_vect.transform(X)
        X_tfidf = TfidfTransformer().fit_transform(X_count)

        if partial:
            clf.partial_fit(X_tfidf, y)
        else:
            clf.fit(X_tfidf, y)

    def predict(self, X, count_vect, clf):
        X_count = count_vect.transform(X)
        X_tfidf = TfidfTransformer().fit_transform(X_count)
        return clf.predict(X_tfidf)

    def train_all_tag(self):
        train_tag_list = []
        heightest_score = defaultdict(float)
        heightest_info = {}
        heightest_model = {}
        all_tag_idx = self.prep.get_all_tag_idx()

        for target_tag in all_tag_idx:
            for model_name in self.models:
                X_train, y_train, X_test, y_test = self.prep.get_train_test_data_tag(
                    target_tag)

                if len(y_train) < 200:
                    print "[Train] not enough (%3d less than 100) sample for '%s'" % (
                        len(y_train), self.prep.get_target_names()
                        [target_tag].encode('utf-8'))
                    continue

                # use only content from (paragraph_id,content)
                X_train = [data[1] for data in X_train]
                X_test = [data[1] for data in X_test]

                train_tag_list.append(target_tag)
                print "train %s" % target_tag
                self.train(X_train, y_train, self.count_vect,
                           self.models[model_name], False)

                predicted = self.predict(X_test, self.count_vect,
                                         self.models[model_name])
                score = np.mean(predicted == y_test)

                matrix = metrics.precision_recall_fscore_support(
                    y_test, predicted, average='binary', pos_label=target_tag)
                precision = matrix[0]
                recall = matrix[1]
                f1 = matrix[2]

                print "%s score %.2f (%s)-[%d/%d]" % (
                    model_name, score,
                    self.prep.get_target_names()[target_tag].encode('utf-8'),
                    len(y_train), len(y_test))
                print "precision=%.2f, recall=%.2f, *** f1=%.2f ***" % (
                    precision, recall, f1)
                print

                if heightest_score[target_tag] < f1:
                    heightest_score[target_tag] = f1
                    heightest_info[target_tag] = (f1, precision, recall, score,
                                                  model_name)
                    heightest_model[target_tag] = deepcopy(
                        self.models[model_name])

        db = DB()
        db.clear_model_score()
        for tag in set(train_tag_list):
            if not tag in heightest_info:
                continue

            output_filename = os.path.join(self.model_file_dir,
                                           "%s.model" % tag)
            print "[Train] save model '%s' (%s) with F1=%.2f, precision=%.2f, recall=%.2f, score=%.2f : %s" % (
                output_filename,
                self.prep.get_target_names()[tag].encode('utf-8'),
                heightest_info[tag][0], heightest_info[tag][1],
                heightest_info[tag][2], heightest_info[tag][3],
                heightest_info[tag][4])
            joblib.dump(heightest_model[tag], output_filename)
            db.add_model_info(tag, output_filename, heightest_score[tag])

        return heightest_info
# --- Exemplo n.o 3 --- (scraper artifact: separator between pasted snippets)
def main(args):
    """Entry point: dispatch on ``args.task``.

    Supported tasks:
      - 'validate': cross-validate the model on the training set.
      - 'train':    shuffle, train, report accuracy, optionally evaluate on
                    a test set and save model + preprocessor.
      - 'plot':     load a trained tree ensemble and plot its estimators.
      - 'dt_eval':  score each individual estimator of a saved ensemble.
    """
    model = get_model(args)

    if args.task == 'validate':
        X_Train = load_csv(args.train_X)
        T_Train = load_csv(args.train_T).flatten()
        X_Train_phi, phi = preprocess(args, X_Train, T_Train)

        logging.info('Training')
        model.validate(X_Train_phi, T_Train, params=get_param_validate(args))

    elif args.task == 'train':
        X_Train = load_csv(args.train_X)
        T_Train = load_csv(args.train_T).flatten()
        X_Train_phi, phi = preprocess(args, X_Train, T_Train)

        # Shuffle samples and targets with one shared permutation.
        # (Bug fix: the original shuffled a range() object in place, which
        # only works on Python 2; np.random.permutation works everywhere.)
        inds = np.random.permutation(len(X_Train))
        X_Train_phi = X_Train_phi[inds]
        T_Train = T_Train[inds]

        logging.info('Training')
        model.train(X_Train_phi, T_Train, param=get_param(args))

        train_acc = model.eval(X_Train_phi, T_Train)
        logging.info('Training Accuracy = %f', train_acc)

        # Evaluate on the held-out set only when both files were given.
        if args.test_X is not None and args.test_T is not None:
            X_Test = load_csv(args.test_X)
            T_Test = load_csv(args.test_T).flatten()
            X_Test_phi = phi.transform(X_Test)
            test_acc = model.eval(X_Test_phi, T_Test)
            logging.info('Testing Accuracy = %f', test_acc)

            print(test_acc)

        if args.save is not None:
            model.save(args.save)
            logging.info('Model saved at %s', args.save)

            # Persist the fitted preprocessor next to the model.
            phi.save(args.save + '_phi')
            logging.info('Model preprocessor saved at %s_phi', args.save)

    elif args.task == 'plot':
        model.load(args.load)
        logging.info('Model loaded from %s', args.load)
        logging.info('Plotting')
        l_tree = model.model.estimators_
        plot_decision_tree(l_tree, args.dot)

    elif args.task == 'dt_eval':
        # Reload the preprocessor that was saved alongside the model.
        phi = Preprocessor()
        phi.load(args.load + '_phi')

        X_Test = load_csv(args.test_X)
        T_Test = load_csv(args.test_T).flatten()
        X_Test_phi = phi.transform(X_Test)

        model.load(args.load)
        logging.info('Model loaded from %s', args.load)
        logging.info('Decision Tree Evaluating')
        # Score each tree of the ensemble individually.
        l_tree = model.model.estimators_
        for tree in l_tree:
            test_acc = tree.score(X_Test_phi, T_Test)
            print('%f' % test_acc)