# -*- coding: utf-8 -*-
"""
Created on Mon Oct 29 17:17:02 2018

@author: Erik

Trains a small feed-forward neural network on TF-IDF features produced by
the project-local `get_data` module. This chunk loads the data, splits it
into train/validation/test sets, and begins building the Keras model; the
model definition continues beyond this chunk.
"""
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD  # NOTE(review): unused in this chunk — presumably used when compiling the model below

from get_data import get_data_tfidf, one_hot_encode

# used to split data
from sklearn.model_selection import train_test_split

# file name, max gram length, min occurrences of gram
# for me get_data('data-1_train.csv', 3, 3) is around 68-70% accuracy on test,
# which is actually great!
X, y = get_data_tfidf('data-2_train.csv')
y = one_hot_encode(y)

# split as required: 10% held out for test, then 20% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=7)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=0.2, random_state=7)

ffnn = Sequential()
# input_dim is the TF-IDF vocabulary size, taken from the first training row
ffnn.add(Dense(8, input_dim=len(X_train[0]), activation='relu'))
# add a second hidden layer, usually fewer and fewer nodes per hidden layer,
# this is such a small example it's way overdone
"""Grid-search driver: runs every (dataset, algorithm, preprocessing) combination
with 10-fold cross-validation. The inner fold loop continues beyond this chunk."""
from sklearn.naive_bayes import GaussianNB
from keras.optimizers import SGD
from keras.regularizers import l1, l2, l1_l2
# NOTE(review): the names below are used in this chunk but were not imported in
# the visible text — imports added here; harmless if the full file already
# imports them earlier, but confirm.
from sklearn.model_selection import KFold
from get_data import get_data_tfidf, get_data_custom, one_hot_encode

DATA_SETS = ['data-1_train.csv', 'data-2_train.csv']
ALGOS = ['nn', 'nb', 'dt', 'rf']          # neural net, naive bayes, decision tree, random forest (presumably — confirm)
PRE_PROCS = ['tfidf', 'cust']             # TF-IDF features vs. custom features

# NOTE(review): handle is never closed in the visible portion of the script —
# consider `with open('test.csv', 'w') as file:` once the full file is in view.
file = open('test.csv', 'w')
file.write("test")

# Exhaustively evaluate every dataset / algorithm / preprocessing combination.
for ds in DATA_SETS:
    for alg in ALGOS:
        for proc in PRE_PROCS:
            if proc == 'tfidf':
                X, y = get_data_tfidf(ds)
            else:
                # args: (file, max gram length, min occurrences, flag) — semantics
                # of the final False not visible here; see get_data_custom.
                X, y = get_data_custom(ds, 2, 0, False)
            y_encode = one_hot_encode(y)

            kf = KFold(n_splits=10)
            kf.get_n_splits(X)
            scores = []
            print("Working on: " + ds + " " + alg + " " + proc)

            i = 1  # fold counter, for progress output only
            for train_index, test_index in kf.split(X):
                print(i)
                i += 1
                X_train, X_test = X[train_index], X[test_index]
                # NOTE(review): fold-loop body continues beyond this chunk
                # (y split, model fit/score, appending to `scores`, presumably).