# -*- coding: utf-8 -*-
"""
Created on Mon Oct 29 17:17:02 2018

@author: Erik
"""
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from get_data import get_data_tfidf, one_hot_encode
#used to split data
from sklearn.model_selection import train_test_split

# Load the features X and labels y from 'data-2_train.csv', then one-hot
# encode the labels.
# NOTE(review): the original author's note at this spot described a different
# call, get_data('data-1_train.csv', 3, 3) (file name, max gram length, min
# occurrences of a gram), and reported roughly 68-70% test accuracy for it.
# That note does not describe the get_data_tfidf call used here.
X, y = get_data_tfidf('data-2_train.csv')
y = one_hot_encode(y)

# First split: hold out 10% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=7
)
# Second split: carve 20% of the remaining training samples off for validation.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=0.2, random_state=7
)

# Start the feed-forward network with an 8-unit ReLU layer whose input width
# is the length of the first training row.
ffnn = Sequential()
ffnn.add(Dense(8, activation='relu', input_dim=len(X_train[0])))
# Add a second hidden layer. Hidden layers usually get fewer and fewer nodes; for an example this small, that is already way overdone.
# Example #2
# 0
from sklearn.naive_bayes import GaussianNB
from keras.optimizers import SGD
from keras.regularizers import l1, l2, l1_l2

# Training CSV files iterated over by the loop below.
DATA_SETS = ['data-1_train.csv', 'data-2_train.csv']
# Algorithm codes iterated over by the loop below.
# NOTE(review): presumably nn = neural network, nb = naive Bayes,
# dt = decision tree, rf = random forest -- confirm against the dispatch code.
ALGOS = ['nn', 'nb', 'dt', 'rf']
# Pre-processing modes: 'tfidf' selects get_data_tfidf; any other value falls
# through to get_data_custom in the loop below.
PRE_PROCS = ['tfidf', 'cust']

# Output file, opened for writing (truncates any existing 'test.csv').
# NOTE(review): the handle is not closed anywhere in the visible part of the
# script -- confirm it is closed later, or switch to a `with` block.
file = open('test.csv', 'w')
file.write("test")

# Walk every (data set, algorithm, pre-processing) combination.
# NOTE(review): the loop body continues beyond the visible source; presumably
# each combination is cross-validated and its scores recorded -- confirm.
for ds in DATA_SETS:
    for alg in ALGOS:
        for proc in PRE_PROCS:
            # Load features/labels with the selected pre-processing step.
            # NOTE(review): the load does not depend on `alg`, so the same
            # data is reloaded once per algorithm.
            if proc == 'tfidf':
                X, y = get_data_tfidf(ds)
            else:
                # NOTE(review): the meaning of (2, 0, False) is defined by
                # get_data_custom, which is not visible here.
                X, y = get_data_custom(ds, 2, 0, False)

            y_encode = one_hot_encode(y)

            # 10-way K-fold splitter (presumably sklearn's KFold -- the import
            # is not visible here).
            kf = KFold(n_splits=10)
            # NOTE(review): the return value is discarded; this call looks
            # redundant -- confirm before removing.
            kf.get_n_splits(X)
            scores = []

            print("Working on: " + ds + " " + alg + " " + proc)
            i = 1  # 1-based fold counter, printed as a progress indicator
            for train_index, test_index in kf.split(X):
                print(i)
                i += 1
                # Select this fold's train/test rows by index.
                X_train, X_test = X[train_index], X[test_index]