예제 #1 — 파일: train_all.py, 프로젝트: BigDaMa/DFS
def run(analyzer, ngrams, lowercase, stop_words, max_df, min_df, norm, use_idf,
        smooth_idf, sublinear_tf):
    """Train an RBF-SVM on TF-IDF features of the News corpus and score it.

    All ten parameters are passed straight through to the TFIDF featurizer
    and configure the vectorizer (analyzer mode, n-gram size, casing,
    stop-word handling, document-frequency cutoffs, normalization, and the
    idf/tf weighting flags).

    Returns:
        The trained model's score on the held-out test split.
    """

    def train_model(X_train, y_train, stat_model):
        # Fixed hyperparameters; a 5-fold hyperparameter search was used
        # previously but is skipped here to keep runs fast and deterministic.
        params = {'kernel': 'rbf', 'C': 10, 'probability': True, 'gamma': 0.1}
        stat_model.train(X_train, y_train, params)

    def test_model(stat_model, X_test, y_test, fscore_list):
        # NOTE: despite the parameter name, this records *accuracy*,
        # not an F-score.
        y_true, y_pred = y_test, stat_model.predict(X_test)
        fscore_list.append(accuracy_score(y_true, y_pred))
        print(fscore_list[-1])

    # Load the train/test split of the News dataset.
    data = News()
    data_train_x, data_train_y = data.get_train()
    data_test_x, data_test_y = data.get_test()

    featurizer = TFIDF(analyzer, ngrams, lowercase, stop_words, max_df, min_df,
                       norm, use_idf, smooth_idf, sublinear_tf)
    (x, y, X_test, y_test) = featurizer.featurize(data_train_x, data_train_y,
                                                  data_test_x, data_test_y)

    stat_model = SVC_Model()

    # Seeded with 0 as a baseline entry; only appended scores are meaningful.
    accuracy_list = [0]

    start_time = time.time()

    # Train once on the full featurized training set, then evaluate.
    train_model(x, y, stat_model)
    test_model(stat_model, X_test, y_test, accuracy_list)

    print("--- %s seconds ---" % (time.time() - start_time))

    return stat_model.model.score(X_test, y_test)
예제 #2 — 파일: ZOMBIE.py, 프로젝트: BigDaMa/DFS
    y_true, y_pred = y_test, stat_model.predict(X_test)

    fscore_list.append(accuracy_score(y_true, y_pred))
    print(fscore_list[-1])


# Load the train/test split of the News dataset.
data = News()
#data = Spam()
data_train_x, data_train_y = data.get_train()
data_test_x, data_test_y = data.get_test()

random_seed = 42

# TF-IDF featurization with the featurizer's default settings.
featurizer = TFIDF()
(x, y, X_test, y_test) = featurizer.featurize(data_train_x, data_train_y,
                                              data_test_x, data_test_y)

# Number of training samples. Was a Python 2 `print` statement, which is a
# SyntaxError under Python 3; the rest of this project uses print().
print(x.shape[0])
예제 #3 — 프로젝트: BigDaMa/DFS
use_idf	true
'''

# TF-IDF vectorizer configuration: character n-grams restricted to word
# boundaries, lowercased, no stop-word filtering.
analyzer = 'char_wb'
ngrams = 2
lowercase = True
stop_words = None

# Document-frequency cutoffs and weighting scheme.
max_df = 1.0
min_df = 0.01
norm = "l2"
use_idf = True
smooth_idf = True
sublinear_tf = False

featurizer = TFIDF(analyzer, ngrams, lowercase, stop_words, max_df, min_df,
                   norm, use_idf, smooth_idf, sublinear_tf)
(x, y, X_test, y_test) = featurizer.featurize(data_train_x, data_train_y,
                                              data_test_x, data_test_y)

# Accumulators filled by the run loop below.
all_list = []
time_list_all = []
sum_uncertainty_all = []
for run in range(1):

    stat_model = SVC_Model()

    random_seed += 1