def multinomial_nb(which_comments):
    """Evaluate a Multinomial Naive Bayes classifier on the comment data.

    Loads ``all_comments.xlsx`` from the ``preprocessed_data`` directory,
    draws stratified shuffle splits (10% held out for testing) over the
    ``Comment`` / ``Type`` columns, fits a fresh ``MultinomialNB`` on every
    split and prints the weighted F1 score of each split followed by the
    average over all splits.

    Args:
        which_comments: Not used by this function; kept so that existing
            callers continue to work.
    """
    import os

    # Single source of truth for the number of splits: it is used both to
    # configure the splitter and to average the scores afterwards.
    n_splits = 10

    print("=> Multinomial Bayes naive classifier")

    # Join the path components instead of hard-coding a backslash so the
    # file is also found on non-Windows systems.
    data_frame = pd.read_excel(
        os.path.join('preprocessed_data', 'all_comments.xlsx'))

    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.1)
    total = 0
    splits = sss.split(data_frame['Comment'], data_frame['Type'])
    for index, (train_index, test_index) in enumerate(splits, start=1):
        # Presumably (re)builds the train/test sets returned by the
        # preprocessing getters below -- confirm in the preprocessing module.
        preprocessing.preprocess_train_test_data(train_index, test_index)

        mnb = MultinomialNB()
        mnb.fit(preprocessing.get_data_set(), preprocessing.get_data_labels())
        score = f1_score(preprocessing.get_test_labels(),
                         mnb.predict(preprocessing.get_test_set()),
                         average='weighted')
        total += score

        print("Score {}.: {:.2f}%".format(index, score * 100), end=" ")
        # Break the output line after the fifth score to keep it readable.
        if index == 5:
            print()

    print()
    print("Average: {:.2f}%".format(total / n_splits * 100))
# Example #2
# 0
def compare_regularisation_functions(data_frame, rf, c=1.0):
    """Score logistic regression with a given penalty over stratified splits.

    Draws 10 stratified shuffle splits (10% test) over the ``Comment`` /
    ``Type`` columns of ``data_frame``. For each split a
    ``LogisticRegression`` is fitted and its weighted F1 score printed; the
    average over the 10 splits is printed at the end.

    Args:
        data_frame: Table providing the ``Comment`` and ``Type`` columns
            used to generate the splits.
        rf: Penalty name passed to ``LogisticRegression``; ``'l1'`` selects
            the ``saga`` solver, anything else selects ``lbfgs``.
        c: Value passed as ``C`` to ``LogisticRegression``.
    """
    rounds = 10
    # The solver depends only on the requested penalty, so pick it once.
    solver_name = 'saga' if rf == 'l1' else 'lbfgs'

    splitter = StratifiedShuffleSplit(n_splits=rounds, test_size=0.1)
    folds = splitter.split(data_frame['Comment'], data_frame['Type'])

    score_sum = 0
    for round_no, (train_idx, test_idx) in enumerate(folds, start=1):
        # Presumably refreshes the data served by the preprocessing getters
        # for this split -- confirm in the preprocessing module.
        preprocessing.preprocess_train_test_data(train_idx, test_idx)

        model = LogisticRegression(penalty=rf, C=c, solver=solver_name,
                                   max_iter=15000)
        model.fit(preprocessing.get_data_set(),
                  preprocessing.get_data_labels())

        expected = preprocessing.get_test_labels()
        predicted = model.predict(preprocessing.get_test_set())
        fold_score = f1_score(expected, predicted, average='weighted')
        score_sum += fold_score

        print("Score({}) {}.: {:.2f}%".format(rf.upper(), round_no,
                                              fold_score * 100), end=" ")
        # Wrap the output after the fifth score.
        if round_no == 5:
            print()

    print()
    print("Average: {:.2f}%".format(score_sum / rounds * 100))
# Example #3
# 0
def optimize_c_parameter():
    """Search for a good ``C`` for logistic regression via nested CV.

    Runs a ``NestedCV`` search (5 outer and 5 inner folds) over a fixed list
    of ``C`` candidates on the data returned by the ``preprocessing``
    getters, then prints the arithmetic mean of the ``C`` values found in
    ``best_inner_params_list``.
    """
    c_candidates = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    search_space = {
        'max_iter': [15000],
        'C': c_candidates,
    }
    cv_settings = {'sqrt_of_score': True, 'randomized_search_iter': 30}

    searcher = NestedCV(
        model=LogisticRegression(),
        params_grid=search_space,
        outer_kfolds=5,
        inner_kfolds=5,
        cv_options=cv_settings,
    )

    features = preprocessing.get_data_set()
    labels = preprocessing.get_data_labels()
    searcher.fit(features, labels)

    # One entry of best parameters is read per element of the result list;
    # only the C value of each is of interest here.
    best_c_values = [best['C'] for best in searcher.best_inner_params_list]
    optimized_c_value = np.mean(best_c_values)
    print("Optimized C: {:.3f}".format(optimized_c_value))
# Example #4
# 0
def compare_regularisation_functions(data_frame, rf, c=1):
    """Score a linear SVM with a given penalty over stratified splits.

    Draws 10 stratified shuffle splits (10% test) over the ``Comment`` /
    ``Type`` columns of ``data_frame``. For each split a ``LinearSVC`` is
    fitted and its weighted F1 score printed; the average over the 10
    splits is printed at the end.

    Args:
        data_frame: Table providing the ``Comment`` and ``Type`` columns
            used to generate the splits.
        rf: Penalty name passed to ``LinearSVC``. The ``dual`` option is
            enabled only when this equals ``'l2'``.
        c: Value passed as ``C`` to ``LinearSVC``.
    """
    rounds = 10
    # The dual flag depends only on the penalty, so compute it once.
    use_dual = rf == 'l2'

    splitter = StratifiedShuffleSplit(n_splits=rounds, test_size=0.1)
    folds = splitter.split(data_frame['Comment'], data_frame['Type'])

    running_total = 0
    for round_no, (train_idx, test_idx) in enumerate(folds, start=1):
        # Presumably refreshes the data served by the preprocessing getters
        # for this split -- confirm in the preprocessing module.
        preprocessing.preprocess_train_test_data(train_idx, test_idx)

        classifier = LinearSVC(penalty=rf, C=c, dual=use_dual,
                               max_iter=15000)
        classifier.fit(preprocessing.get_data_set(),
                       preprocessing.get_data_labels())

        expected = preprocessing.get_test_labels()
        predicted = classifier.predict(preprocessing.get_test_set())
        fold_score = f1_score(expected, predicted, average='weighted')
        running_total += fold_score

        print("Score({}) {}.: {:.2f}%".format(rf.upper(), round_no,
                                              fold_score * 100),
              end=" ")
        # Wrap the output after the fifth score.
        if round_no == 5:
            print()

    print()
    print("Average: {:.2f}%".format(running_total / rounds * 100))