def run(analyzer, ngrams, lowercase, stop_words, max_df, min_df, norm, use_idf, smooth_idf, sublinear_tf):
    """Train an RBF-SVM on TF-IDF features and return its test-set score.

    The parameters mirror the TFIDF featurizer's constructor arguments
    (analyzer type, n-gram size, casing, stop words, document-frequency
    cutoffs, norm, and the idf/tf weighting flags).

    Returns:
        The trained model's score on the held-out test split, as reported
        by ``stat_model.model.score``.
    """

    def train_model(X_train, y_train, stat_model):
        # Fixed hyperparameters; stat_model.optimize_hyperparameters was
        # previously used to grid-search these values.
        params = {'kernel': 'rbf', 'C': 10, 'probability': True, 'gamma': 0.1}
        stat_model.train(X_train, y_train, params)

    def test_model(stat_model, X_test, y_test, accuracy_list):
        # NOTE(review): despite the historical "fscore" naming elsewhere in
        # this file, this records *accuracy*, not F-score.
        y_true, y_pred = y_test, stat_model.predict(X_test)
        accuracy_list.append(accuracy_score(y_true, y_pred))
        print(accuracy_list[-1])

    # Load data (News corpus; a Spam dataset was an alternative here).
    data = News()
    data_train_x, data_train_y = data.get_train()
    data_test_x, data_test_y = data.get_test()

    featurizer = TFIDF(analyzer, ngrams, lowercase, stop_words, max_df,
                       min_df, norm, use_idf, smooth_idf, sublinear_tf)
    (x, y, X_test, y_test) = featurizer.featurize(
        data_train_x, data_train_y, data_test_x, data_test_y)

    stat_model = SVC_Model()
    # Seed entry so test_model appends after an initial 0 baseline.
    accuracy_list = [0]

    start_time = time.time()
    train_model(x, y, stat_model)
    test_model(stat_model, X_test, y_test, accuracy_list)
    print("--- %s seconds ---" % (time.time() - start_time))

    return stat_model.model.score(X_test, y_test)
# NOTE(review): the first three statements below are the tail of an
# evaluation helper whose `def` line was lost when this file's formatting
# was mangled — y_test / stat_model / X_test / fscore_list are expected to
# be bound by that (missing) enclosing scope.
y_true, y_pred = y_test, stat_model.predict(X_test)
fscore_list.append(accuracy_score(y_true, y_pred))
print(fscore_list[-1])

# Load data (News corpus; a Spam dataset was an alternative here).
data = News()
data_train_x, data_train_y = data.get_train()
data_test_x, data_test_y = data.get_test()

random_seed = 42

# Featurize with the TFIDF defaults (no explicit vectorizer settings).
featurizer = TFIDF()
(x, y, X_test, y_test) = featurizer.featurize(
    data_train_x, data_train_y, data_test_x, data_test_y)

# Function-call form keeps this valid on both Python 2 and 3; the rest of
# the file already uses print(...) calls.
print(x.shape[0])

# Dead experiment code, deliberately kept as an inert string (verbatim).
'''
y_encoder = LabelBinarizer()
y_encoder.fit(y)
y_one_hot = y_encoder.transform(y)
print y_one_hot.shape
print x.shape
all_data = hstack((x, y_one_hot))
print all_data.shape
'''
# NOTE(review): the string below is the tail of a truncated notes block
# (it records a best-found vectorizer setting); preserved verbatim.
use_idf true
'''
#print newsgroups_train

# TF-IDF vectorizer settings for this experiment run.
analyzer = 'char_wb'
lowercase = True
max_df = 1.0
min_df = 0.01
ngrams = 2
norm = "l2"
smooth_idf = True
stop_words = None
sublinear_tf = False
use_idf = True

featurizer = TFIDF(analyzer, ngrams, lowercase, stop_words, max_df, min_df, norm, use_idf, smooth_idf, sublinear_tf)
(x, y, X_test, y_test) = featurizer.featurize(data_train_x, data_train_y, data_test_x, data_test_y)

# Per-run accumulators for results, timings, and uncertainty sums.
all_list = []
time_list_all = []
sum_uncertainty_all = []

#incremental training
# NOTE(review): the loop variable `run` shadows the module-level run()
# function — consider renaming. Loop body may continue beyond this chunk.
for run in range(1):
    stat_model = SVC_Model()
    random_seed += 1