def train_predict_test(subject, clf, X, X_test, enhance_size=0):
    # load_grouped_train_data / generate_overlapped_data / scale_across_time
    # are project-local preprocessing helpers for the seizure-prediction data
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject,
                                                   filenames_grouped_by_hour)
    X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
                                    window_size=X.shape[-1],
                                    overlap_interictal=True, overlap_preictal=True)
    # fit the per-channel scalers on the training data, then reuse them on test
    X, scalers = scale_across_time(X, x_test=None)
    X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)
    print X.shape
    # flatten (n, channels, freq, time) into (n, features) for a flat classifier
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
    X_test = X_test.reshape(X_test.shape[0],
                            X_test.shape[1] * X_test.shape[2] * X_test.shape[3])
    X, xt, y, yt = train_test_split(X, y, test_size=0.25)
    print "train size", X.shape
    print "test_size", xt.shape
    clf.fit(X, y)  # was clf.fit(X): a supervised fit needs the labels
    preds_proba = clf.predict(X_test)
    validation_preds = clf.predict(xt)
    return preds_proba, list(validation_preds), list(yt)
def get_model_data(name):
    cfgs = config.model_configs(name)
    data = datasets.load_dataset(cfgs['dataset'])
    if 'target' in cfgs:
        target = datasets.load_dataset(cfgs['target'])
        n_train = target.shape[0]
        train_data, test_data = cross_validation.train_test_split(data, n_train)
        data = ((train_data, target), test_data)
    return data
def split_evenly(X, y, test_size=0.25):
    # split preictal (y == 1) and interictal (y == 0) examples separately so
    # the test set contains roughly equal numbers of each class
    preictal_indices = y == 1
    interictal_indices = y == 0
    X_p, y_p = X[preictal_indices], y[preictal_indices]
    X_i, y_i = X[interictal_indices], y[interictal_indices]
    # size the interictal test fraction so it yields as many test examples as
    # the preictal split does
    num_p = X_p.shape[0] * test_size
    test_size_i = num_p / X_i.shape[0]
    # was hardcoded to .25; honor the test_size parameter instead
    X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(
        X_p, y_p, test_size=test_size, random_state=33)
    X_i_train, X_i_test, y_i_train, y_i_test = train_test_split(
        X_i, y_i, test_size=test_size_i, random_state=39)
    X = np.vstack((X_p_train, X_i_train))
    Xt = np.vstack((X_p_test, X_i_test))
    y = np.append(y_p_train, y_i_train)
    yt = np.append(y_p_test, y_i_test)
    return X, Xt, y, yt
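# A quick usage sketch for split_evenly (toy arrays with hypothetical shapes;
# assumes numpy and sklearn's train_test_split are imported as above):
import numpy as np

X_demo = np.random.randn(100, 5)            # 100 windows, 5 features each
y_demo = np.array([1] * 20 + [0] * 80)      # 20 preictal, 80 interictal

X_tr, X_te, y_tr, y_te = split_evenly(X_demo, y_demo, test_size=0.25)
# the test set comes out class-balanced: ~5 preictal and ~5 interictal windows
print((y_te == 1).sum(), (y_te == 0).sum())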
def train_predict_test_cnn(subject, clf, X, X_test, enhance_size=0):
    filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
    data_grouped_by_hour = load_grouped_train_data('preprocessed/cnn/', subject,
                                                   filenames_grouped_by_hour)
    X, y = generate_overlapped_data(data_grouped_by_hour, overlap_size=10,
                                    window_size=X.shape[-1],
                                    overlap_interictal=True, overlap_preictal=True)
    X, scalers = scale_across_time(X, x_test=None)
    X_test, _ = scale_across_time(X_test, x_test=None, scalers=scalers)
    # alternative split / augmentation path, kept for reference:
    # X, xt, y, yt = split_evenly(X, y, test_size=0.5)
    # if enhance_size > 0:
    #     X, y = enhance_data(X, y, enhance_size, cnn=True)
    #     xt, yt = enhance_data(xt, yt, enhance_size / 2, cnn=True)
    X, xt, y, yt = train_test_split(X, y, test_size=0.25, random_state=42)
    print "train size", X.shape
    print "test_size", xt.shape
    # one model per EEG channel: row i holds channel i's predictions
    preds_proba = np.zeros((X.shape[1], X_test.shape[0]))
    val_proba = np.zeros((xt.shape[1], xt.shape[0]))
    weighting = np.zeros((X.shape[1],))
    train_loss = np.array([])  # unused placeholders, returned for API symmetry
    valid_loss = np.array([])
    for i in range(0, X.shape[1]):
        print "Progress: " + str(100 * i / X.shape[1]) + '%'
        X_train = X[:, i, :, :]
        xt_train = xt[:, i, :, :]
        # clf.fit returns (validation score, validation predictions)
        weighting[i], val_proba[i, :] = clf.fit(X_train, y, xt_train, yt)
        X_test_subset = X_test[:, i, :, :]
        preds_proba[i, :] = clf.predict_proba(X_test_subset)
    sc = np.amax(weighting)
    print "Best score:" + str(sc)
    # turn per-channel validation scores into averaging weights
    weighting -= weighting.min()
    weighting /= weighting.sum()
    # preds_proba = preds_proba[np.argmax(weighting), :]  # single-best variant
    preds_proba = np.average(preds_proba, axis=0, weights=weighting)
    preds_scaled = preds_proba  # preds_scaled = min_max_scale(preds_proba)
    validation_preds = np.average(val_proba, axis=0, weights=weighting)
    return preds_scaled, preds_proba, list(validation_preds), list(yt), train_loss, valid_loss
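# The loop above is a per-channel ensemble: one net per EEG channel, with the
# channel's validation score becoming its averaging weight. A toy sketch of
# just that weighting step (hypothetical scores and predictions, not project
# data):
import numpy as np

scores = np.array([0.60, 0.75, 0.90])      # per-channel validation scores
channel_preds = np.array([[0.2, 0.8],      # each row: one channel's
                          [0.3, 0.7],      # predictions for two test clips
                          [0.1, 0.9]])

w = scores - scores.min()                  # worst channel gets weight 0
w = w / w.sum()                            # weights sum to 1
blended = np.average(channel_preds, axis=0, weights=w)
print(blended)                             # weighted per-clip average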
                tmp_prob += self.Pwv[cls][word]
                # wordset.add(word)
            if tmp_prob > max_prob:
                max_prob = tmp_prob
                result = cls
        # print('Pr:', result, max_prob, end=' ')
        return result


if __name__ == '__main__':
    # print(os.path.dirname(os.path.realpath(__file__)))
    parser = argparse.ArgumentParser("Read in data directory")
    parser.add_argument('data_dir')
    files, cls = fe.get_file_name_and_path(parser.parse_args().data_dir)
    # project-local splitter: takes the test fraction positionally and returns
    # (train_X, train_Y, test_X, test_Y), unlike sklearn's ordering
    train_X, train_Y, test_X, test_Y = train_test_split(files, cls, 0.25)
    nb = NaiveBayes()
    nb.fit(train_X, train_Y, 40)
    # parameter sweep, kept for reference:
    # lst = []
    # for i in range(1, 100):
    #     nb.fit(train_X, train_Y, i)
    #     y_pred = nb.predict_list(test_X)
    #     lst.append(f1_score(test_Y, y_pred))
    #     print(i, lst[-1])
    # do_plot(0, lst)
    with open('naive_bayes.pkl', 'wb') as save_learner:  # close the file handle
        pickle.dump(nb, save_learner)
    # load_learner = open('naive_bayes.pkl', 'rb')
    # nb = pickle.load(load_learner)
        learner.fit(x, features.y_transform(train_Y))
        x = []
        for f_name in test_X:
            x.append(features.get_x_vector(f_name, weight))
        scores[ii] = f1_score(features.y_transform(test_Y),
                              learner.predict(x).tolist())
    return mean(scores)


if __name__ == '__main__':
    parser = argparse.ArgumentParser("Read in data directory")
    parser.add_argument('data_dir')
    print('Reading Data Path')
    files, dirs = fe.get_file_name_and_path(parser.parse_args().data_dir)
    print('Splitting Train-Test Set')
    train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)
    learner = svm.SVC(kernel='rbf', C=1)
    features = Features(train_x, train_y, 3, 0, 160)
    x = []
    for f_name in train_x:
        x.append(features.get_x_vector(f_name, 'tfidf'))
    learner.fit(x, features.y_transform(train_y))
    x = []
    for f_name in test_x:
        x.append(features.get_x_vector(f_name, 'tfidf'))
    print('Score:', f1_score(features.y_transform(test_y),
                             learner.predict(x).tolist()))
    # print('Test if "TFIDF" is better than "TF"')
import pandas as pd
import timeit
from sklearn.preprocessing import StandardScaler
# NeuralNetwork and train_test_split come from local modules (not shown)

eeg_data = pd.read_csv('EEG_Eye_State.csv', header=None)
# remove examples with extreme outliers (only 3 rows out of 15,000)
eeg_data = eeg_data[(eeg_data <= 10000).all(axis=1)]
X = eeg_data.iloc[:, :-1]
y = eeg_data.iloc[:, -1].values.reshape(-1, 1)  # .values: a Series has no reshape
print X.shape
print y.shape
X = StandardScaler().fit_transform(X)
# use my custom data splitter for training/test
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=0, test_size=0.25)
# timeit.timeit() times a dummy statement; default_timer() is the wall clock
start_time = timeit.default_timer()
print "starting fit"
nn = NeuralNetwork(n_iter=2, n_print=5, learning_rate=1,
                   num_hidden_units=24, seed=42, verbose=False)
nn.fit(X_train, y_train)
end_time = timeit.default_timer()
print "Fitting time: {}".format(end_time - start_time)
print "W matrix (size = {} x {}) = {}".format(nn.W.shape[0], nn.W.shape[1], nn.W)
print "V matrix (size = {} x {}) = {}".format(nn.V.shape[0], nn.V.shape[1], nn.V)
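# The custom splitter used above isn't shown. A minimal sketch of what its
# assumed signature (X, y, seed, test_size) could look like -- a guess for
# illustration, not the author's actual implementation:
import numpy as np

def train_test_split(X, y, seed=0, test_size=0.25):
    rng = np.random.RandomState(seed)            # reproducible shuffle
    idx = rng.permutation(X.shape[0])
    n_test = int(round(test_size * X.shape[0]))
    test_idx, train_idx = idx[:n_test], idx[n_test:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]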