Exemplo n.º 1
0
class content_based_classifier:
    """Text classifier that labels a passage with a paper-section type.

    The pipeline is: TF-IDF vectorisation (`set_dataset`), optional
    tree-based feature selection (`learn_FS_model`), train/test split
    (`split_train_test`), randomized hyper-parameter search and evaluation
    (`train_and_test`), and persistence via joblib (`save_*`).

    Attributes ending in an underscore are populated by the methods below;
    calling a method before the attributes it reads exist raises
    ``AttributeError``.
    """

    def __init__(self, test_size=0.4):
        """Set up the label vocabulary and the TF-IDF vectorizer.

        Args:
            test_size: value forwarded to ``train_test_split`` as
                ``test_size`` when `split_train_test` is called.
        """
        self.tag_set_ = ['0', '1', '2', '3', '4']
        # Maps the string form of each numeric label to a section name.
        self.tag_dict_ = {
            '0': 'introduction',
            '1': 'related_work',
            '2': 'method',
            '3': 'result',
            '4': 'conclusion',
        }
        # Section names ordered by their (string) label key.
        self.labels_ = [self.tag_dict_[key] for key in sorted(self.tag_dict_)]
        self.test_size_ = test_size
        # `tokenize` is a helper defined outside this class.
        self.vec_ = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.5)

    def set_dataset(self, data_path):
        """Load a JSON dataset and fit the TF-IDF vectorizer on its text.

        The file must contain an object with a ``'data'`` list whose items
        have a ``'header'`` (convertible to ``int``; used as the label) and a
        ``'content'`` entry (passed to the vectorizer). Per-label counts are
        logged. Results are stored in ``self.X_`` and ``self.y_``.

        Args:
            data_path: path of the JSON file to read.
        """
        # Context manager guarantees the file handle is closed.
        with open(data_path) as handle:
            data = json.load(handle)

        X = []
        y = []
        label_dict = defaultdict(int)
        for sample in data['data']:
            label = int(sample['header'])
            y.append(label)
            label_dict[label] += 1
            X.append(sample['content'])

        for label in sorted(label_dict):
            logging.info('{:}:{:}'.format(label, label_dict[label]))

        self.X_ = self.vec_.fit_transform(X)
        self.y_ = y
        logging.info('training data loading complete! length:{:}'.format(self.X_.shape[0]))

    def learn_FS_model(self):
        """Fit a feature-selection model and reduce ``self.X_`` in place.

        An ``ExtraTreesClassifier`` with default settings is fitted on the
        full ``self.X_`` / ``self.y_`` and wrapped in a prefit
        ``SelectFromModel`` stored as ``self.fs_model_``.

        NOTE(review): this runs on whatever ``self.X_`` holds, so if it is
        called before `split_train_test` the selector sees the rows that
        later become the test set -- confirm that is intended.
        """
        clf = ExtraTreesClassifier()
        clf.fit(self.X_, self.y_)
        self.fs_model_ = SelectFromModel(clf, prefit=True)
        self.X_ = self.fs_model_.transform(self.X_)

    def feature_selection(self, X):
        """Apply the learned feature-selection model to ``X``.

        Args:
            X: feature matrix accepted by ``self.fs_model_.transform``.

        Returns:
            Whatever ``self.fs_model_.transform(X)`` returns.
        """
        # The original called the misspelled ``transfrom`` and dropped the
        # result; call the real method and hand the result back.
        return self.fs_model_.transform(X)

    def split_train_test(self):
        """Split ``self.X_`` / ``self.y_`` into train and test parts.

        Uses ``self.test_size_`` and a fixed ``random_state=0``; results are
        stored in ``X_train_``, ``X_test_``, ``y_train_`` and ``y_test_``.
        """
        (self.X_train_, self.X_test_,
         self.y_train_, self.y_test_) = train_test_split(
            self.X_, self.y_, test_size=self.test_size_, random_state=0)

    def set_classifier(self, clf):
        """Store the estimator that `search_hyper_parameter` will tune."""
        self.clf_ = clf

    def set_param_space(self, params_space):
        """Store the parameter distributions for the randomized search."""
        self.params_space_ = params_space

    def search_hyper_parameter(self, n_iter=5000):
        """Run a randomized hyper-parameter search on the training split.

        Args:
            n_iter: number of parameter settings sampled by
                ``RandomizedSearchCV``.

        Returns:
            The ``best_estimator_`` of the fitted search.
        """
        rs = RandomizedSearchCV(
            self.clf_,
            self.params_space_,
            cv=3,
            verbose=1,
            n_jobs=4,
            n_iter=n_iter,
        )
        rs.fit(self.X_train_, self.y_train_)

        logging.info('best params:{:}'.format(rs.best_params_))
        logging.info('best CV score:{:}'.format(rs.best_score_))

        return rs.best_estimator_

    def save_feature_vector(self, name):
        """Dump the fitted vectorizer to ``'<name>-vec.pkl'`` with joblib."""
        joblib.dump(self.vec_, '{:}-vec.pkl'.format(name))
        logging.info('feature extraction vector saved to {:}-vec.pkl'.format(name))

    def save_fs_model(self, name):
        """Dump the feature-selection model to ``'<name>-fsm.pkl'``."""
        joblib.dump(self.fs_model_, '{:}-fsm.pkl'.format(name))
        logging.info('feature selection vector saved to {:}-fsm.pkl'.format(name))

    def save_model(self, clf, name):
        """Dump the estimator ``clf`` to ``'<name>-model.pkl'`` with joblib."""
        joblib.dump(clf, '{:}-model.pkl'.format(name))
        logging.info('trained model saved to {:}-model.pkl'.format(name))

    def train_and_test(self, params_space, clf, n_iter=5000):
        """Tune ``clf`` on the training split and report on the test split.

        Requires `split_train_test` to have been called. Prints a
        ``classification_report`` for the test split.

        Args:
            params_space: parameter distributions for the randomized search.
            clf: estimator to tune.
            n_iter: number of sampled settings, forwarded to
                `search_hyper_parameter` (default matches its default).

        Returns:
            The best estimator found by the search.
        """
        logging.info('==== STARTING TO TRAIN MODELS ====')
        logging.info('---- SET PARAMS SPACE ----')
        self.set_param_space(params_space)
        logging.info('---- SET CLASSIFIER ----')
        self.set_classifier(clf)
        logging.info('---- SEARCH HYPER PARAMETERS ON training dataset ----')
        best_clf = self.search_hyper_parameter(n_iter=n_iter)
        logging.info('---- TEST TRAINED MODEL ON testing dataset ----')
        y_pred = best_clf.predict(self.X_test_)
        # Pass the label ids explicitly (same order as self.labels_) so the
        # report still works when a class is missing from the test split;
        # otherwise the number of target names would not match the classes
        # found in y_test_/y_pred.
        label_ids = [int(key) for key in sorted(self.tag_dict_)]
        print(classification_report(self.y_test_, y_pred, labels=label_ids,
                                    target_names=self.labels_, digits=4))
        return best_clf