class Factory(object):
    """Coordinate feature extraction (``Analyzer``) and classification
    (``Classify``) for data with title, abstract and claims columns.
    """

    # Column order used whenever per-column features are concatenated.
    # Training (get_all_column_data) and evaluation (evaluate) must agree
    # on it, otherwise feature positions would not line up.
    _COLUMNS = ('title', 'abstract', 'claims')

    def __init__(self, config):
        """
        Build the analyzer and the classifier from one configuration
        :param config: configuration object handed to both ``Analyzer``
            and ``Classify``
        """
        self.config = config
        self.analyzer = Analyzer(self.config)
        self.classify = Classify(config)

    @staticmethod
    def get_all_column_data(file, factory=None):
        """
        Combine the feature matrices of all columns into a single matrix
        :param file: data file forwarded to ``analyze_column_data``
        :param factory: object whose ``analyze_column_data`` is used; when
            omitted, the module-level instance ``f`` is used as before
            (NameError if no such global exists)
        :return: tuple (feature_matrix, response_vector); the response
            vector is the one returned for the last column analyzed
        """
        if factory is None:
            # Legacy behaviour: this static method historically reached for
            # a global instance named ``f``.
            factory = f  # noqa: F821

        matrices = []
        response_vector = None
        for column in Factory._COLUMNS:
            matrix, response_vector = factory.analyze_column_data(
                file, column)
            matrices.append(matrix)

        # One horizontal stack over all columns (title | abstract | claims).
        return hstack(matrices), response_vector

    def analyze_column_data(self, filename, column_name):
        """
        Create the feature model and matrix for one column
        :param filename: data file passed to the analyzer's loader
        :param column_name: column to extract and turn into features
        :return: tuple (analyzer.feature_matrix, analyzer.response) as left
            by the analyzer after feature extraction
        """
        self.analyzer.load_patent_data(filename)
        self.analyzer.extract_data(column_name)
        n_grams = 1
        self.analyzer.extract_features(n_grams, column_name)
        return self.analyzer.feature_matrix, self.analyzer.response

    def compute_heuristics(self, filename, column_name):
        """
        Rebuild the features of one column, then run the analyzer's
        heuristics on it (presumably to see which words characterise each
        group -- confirm against ``Analyzer.heuristics``)
        :param filename: data file passed to the analyzer's loader
        :param column_name: column to analyze
        :return: None
        """
        self.analyze_column_data(filename, column_name)
        self.analyzer.heuristics(column_name)

    def full_train(self):
        """
        Select a classifier, train it and save it
        :return: None
        """
        # Disabled step: self.classify.feature_selection()
        self.classify.classifier_selection()
        # Disabled step: self.classify.optimize_classifier()
        self.classify.train()
        self.classify.save_classifier()

    def evaluate(self, title, abstract, claims):
        """
        Build the combined feature vector of a single entry. This does not
        predict anything itself; pass the result to ``predict``.
        :param title: title text of the entry
        :param abstract: abstract text of the entry
        :param claims: claims text of the entry
        :return: per-column vectors stacked horizontally in the order
            title, abstract, claims
        """
        texts = (title, abstract, claims)
        vectors = []
        for column, text in zip(Factory._COLUMNS, texts):
            # The analyzer holds one model at a time, so load the column's
            # model right before transforming that column's text.
            self.analyzer.load_model(column)
            vectors.append(self.analyzer.transform([text]))
        return hstack(vectors)

    def predict(self, feature_vector):
        """
        Predict class based on feature vector input
        :param feature_vector: feature vector, e.g. as built by ``evaluate``
        :return: whatever the classifier's ``predict`` returns
        """
        return self.classify.predict(feature_vector)
# -*- coding: utf-8 -*-
# Demo script: train the classifier from ``classify`` on a few labelled
# Chinese sentences, predict on unseen sentences, save the model, then load
# it into a fresh instance and predict again.
import sys

import numpy as np

from classify import Classify

if sys.version_info[0] < 3:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8.
    # ``reload`` is not a builtin on Python 3 and ``setdefaultencoding`` no
    # longer exists there, so running this unguarded raises NameError.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf8")


def main():
    """Train, save, reload and re-run the classifier on sample sentences."""
    # Label 1: music-related sentences; label 2: sports-related sentences.
    x_train = np.array([
        u"我想听张学友的歌",
        u"周杰伦的龙卷风",
        u"鹿晗有什么歌好听",
        u"姚明打篮球好厉害",
        u"张继科会打乒乓球",
        u"詹姆士是体育明星",
    ])
    y_train = np.array([1, 1, 1, 2, 2, 2])

    # All samples are unicode literals, matching the training data; the
    # last two used to be byte strings on Python 2.
    test_data = [
        u"我想听薛之谦的演员",
        u"邓亚萍是体育明星",
        u"刘翔是体育明星",
    ]

    model = Classify()
    model.load_W2V_Model("word2vec.model")
    model.train(x_train, y_train)
    # The return value is not used here; presumably ``predict`` reports its
    # results itself -- confirm against ``Classify.predict``.
    model.predict(test_data)
    model.save_NBmodel("NB.model")
    del model

    # Round trip: a fresh instance restored from the file saved above.
    restored = Classify()
    restored.load_NBmodel("NB.model")
    restored.predict(test_data)
    del restored


if __name__ == "__main__":
    main()