def main(generate='xgb', scaling=False):
    """ Runs the different processing steps to generate a solution in CSV format """
    print("loading data in memory...")
    (df_train, target, df_test) = load_data()
    print("preparing datasets...")
    (X, y, X_test, label_encoder) = prepare_datasets(df_train, df_test, target)
    if generate == 'benchmark1':
        print("Generating submission...")
        return generate_benchmark1_submission(df_train, target, df_test['id'])
    elif generate == 'benchmark2':
        print("Generating submission...")
        return generate_benchmark2_submission(df_train, target, df_test['id'])
    else:
        if scaling:
            print("Transforming datasets...")
            (X, scaler, transformer) = feature_transformation(X)
            X_test = transformer.transform(scaler.transform(X_test))
        print("Training classifier...")
        clf = train_classifier(X, y)
        print("Generating submission...")
        return generate_submission(
            clf, X_test, label_encoder, df_test['id'], clf_name=generate)
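# A minimal usage sketch (not in the source): running this module as a script
# to produce the XGBoost submission with feature scaling enabled.
if __name__ == '__main__':
    main(generate='xgb', scaling=True)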
def train_model_validation(filename_train_validation_set,
                           filename_labels_train_validation_set,
                           filter_density, dropout,
                           input_shape, output_shape,
                           file_path_model, filename_log, channel=1):
    """ train model with validation """
    filenames_train, Y_train, \
        filenames_validation, Y_validation, \
        filenames_features, Y_train_validation = \
        load_data(filename_labels_train_validation_set)

    model_0 = model_switcher(filter_density, dropout, input_shape, output_shape)
    # print(model_0.summary())

    batch_size = 256
    patience = 15

    # print(model_0.count_params())

    model_train_validation(model_0, batch_size, patience, input_shape,
                           filename_train_validation_set,
                           filenames_train, Y_train,
                           filenames_validation, Y_validation,
                           file_path_model, filename_log, channel)
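# Hedged usage sketch: every argument below is a hypothetical placeholder; the
# real file names, shapes, and hyperparameters are not shown in this excerpt.
train_model_validation(filename_train_validation_set='features_train_val.h5',
                       filename_labels_train_validation_set='labels_train_val.pickle',
                       filter_density=1,
                       dropout=0.3,
                       input_shape=(80, 21),
                       output_shape=1,
                       file_path_model='model_val.h5',
                       filename_log='train_val.log')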
def load_model_data():
    '''
    Loads the model datasets from the respective files

    Returns:
    results -- dictionary containing the following
        train_x_nb -- training x for naive bayes data set
        train_y    -- training y data set
        test_x_nb  -- test x for naive bayes data set
        test_y     -- test y data set
        train_x_nn -- training x for neural nets (& Logistic regression) data set
        test_x_nn  -- test x for neural nets (& Logistic regression) data set
    '''
    _, train_x_nb, _, _, _ = load_data(constants.train_x_nb_file, [], [])
    _, train_y, _, _, _ = load_data(constants.train_y_file, [], [])
    _, test_x_nb, _, _, _ = load_data(constants.test_x_nb_file, [], [])
    _, test_y, _, _, _ = load_data(constants.test_y_file, [], [])
    _, train_x_nn, _, _, _ = load_data(constants.train_x_nn_file, [], [])
    _, test_x_nn, _, _, _ = load_data(constants.test_x_nn_file, [], [])

    results = {}
    results['train_x_nb'] = train_x_nb
    results['train_y'] = train_y
    results['test_x_nb'] = test_x_nb
    results['test_y'] = test_y
    results['train_x_nn'] = train_x_nn
    results['test_x_nn'] = test_x_nn
    return results
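# Usage sketch: the returned dictionary keys match the docstring above.
model_data = load_model_data()
train_x_nb, train_y = model_data['train_x_nb'], model_data['train_y']
test_x_nb, test_y = model_data['test_x_nb'], model_data['test_y']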
def datasets(scaling=False):
    """ Prepares the datasets and returns (X_train, X_test, y_train, y_test) """
    print("loading data in memory...")
    (df_train, target, df_test) = load_data()
    print("preparing datasets...")
    (X, y, _, _) = prepare_datasets(df_train, df_test, target)
    if scaling:
        print("Transforming datasets...")
        (X, _, _) = feature_transformation(X)
    return train_test_split(X, y, test_size=.30, random_state=42)
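# Usage sketch: grab the 70/30 train/test split with scaling applied.
X_train, X_test, y_train, y_test = datasets(scaling=True)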
def test_load_data(self):
    """ Tests the load_data function. Requires the TESTING_data.csv file.

    Reads from the test file TESTING_data.csv and verifies its contents.
    """
    header_orig_data, data_orig_data, _ = load_data(
        './datasets/TESTING_data.csv')

    # load_data is transposing this item
    header_array = np.array(['Item1', 'Item2', 'Item3', 'Item4'])
    nptest.assert_array_equal(header_orig_data, header_array)

    # load_data is transposing this item
    data_array = np.array([[1, 2, 3, 4],
                           [5, 6, 7, 8]])
    nptest.assert_array_equal(data_orig_data, data_array)
def test_partition_data(self):
    """ Tests the partition_data function. Requires the TESTING_partition_data.csv file. """
    _, data_orig_data, _ = load_data(
        './datasets/TESTING_partition_data.csv')

    # Test the default partition configuration (60/20/20)
    train_set, test_set, dev_set = partition_data(data_orig_data)
    self.assertEqual(train_set.shape[0], 6)
    self.assertEqual(test_set.shape[0], 2)
    self.assertEqual(dev_set.shape[0], 2)

    # Test a different partition configuration
    train_set, test_set, dev_set = partition_data(data_orig_data, 0.8, 0.1)
    self.assertEqual(train_set.shape[0], 8)
    self.assertEqual(test_set.shape[0], 1)
    self.assertEqual(dev_set.shape[0], 1)

    # Test a configuration with no dev set
    train_set, test_set, dev_set = partition_data(data_orig_data, 0.9, 0.1)
    self.assertEqual(train_set.shape[0], 9)
    self.assertEqual(test_set.shape[0], 1)
    self.assertEqual(dev_set.shape[0], 0)
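# A minimal sketch of the partition contract the test above exercises
# (assumption: the real partition_data may also shuffle rows; this version
# splits them in order). For a ten-row input it reproduces the 6/2/2, 8/1/1
# and 9/1/0 splits asserted above.
def partition_data_sketch(data, train_frac=0.6, test_frac=0.2):
    n = data.shape[0]
    n_train = int(n * train_frac)
    n_test = int(n * test_frac)
    return (data[:n_train],
            data[n_train:n_train + n_test],
            data[n_train + n_test:])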
def data_preparation(filename):
    """ Executed only once to create the train, test, and dev datasets.

    Arguments:
    filename -- the file containing the original data

    Returns:
    This function creates three files:
        hr_train.csv -- the training dataset
        hr_test.csv  -- the testing dataset
        hr_dev.csv   -- the development dataset
    """
    # load original dataset
    hr_header_orig, hr_data_orig, _, _, _ = load_data(filename, [], [])

    # partition data 60/20/20
    hr_train, hr_test, hr_dev = partition_data(hr_data_orig)

    # save results in files
    save_data('./datasets/hr_train.csv', hr_header_orig, hr_train)
    save_data('./datasets/hr_test.csv', hr_header_orig, hr_test)
    save_data('./datasets/hr_dev.csv', hr_header_orig, hr_dev)
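# Usage sketch: run once against the attrition CSV referenced elsewhere in
# this repo to materialise the three split files.
data_preparation('.\\datasets\\WA_Fn-UseC_-HR-Employee-Attrition.csv')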
from layers import Softmax, Linear, RNN  # , LSTM
from loss import CrossEntropyLoss
from optimizer import SGDOptimizer
from network import Network
from data_preparation import load_data
from solve_rnn import solve_rnn
import theano.tensor as T

X_train, y_train, X_test, y_test = load_data()

HIDDEN_DIM = 32
INPUT_DIM = 20
OUTPUT_DIM = 10

model = Network()
model.add(RNN('rnn1', HIDDEN_DIM, INPUT_DIM, 0.1))    # output shape: 4 x HIDDEN_DIM
model.add(Linear('fc', HIDDEN_DIM, OUTPUT_DIM, 0.1))  # output shape: 4 x OUTPUT_DIM
model.add(Softmax('softmax'))

loss = CrossEntropyLoss('xent')
optim = SGDOptimizer(0.01, 0.0001, 0.9)

input_placeholder = T.fmatrix('input')
label_placeholder = T.fmatrix('label')
model.compile(input_placeholder, label_placeholder, loss, optim)

MAX_EPOCH = 6
DISP_FREQ = 1000
TEST_FREQ = 10000
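# The excerpt stops before the training call; a plausible continuation,
# assuming solve_rnn consumes the data and the three constants defined above
# (its exact signature is not shown in the source):
solve_rnn(model, X_train, y_train, X_test, y_test,
          MAX_EPOCH, DISP_FREQ, TEST_FREQ)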
from __future__ import print_function
from data_preparation import load_data
from set_session import set_session
import resnet
import keras
import os

batch_size = 64
epochs = 150
im_dir = "../dataset/images/"
train_im_list = "../dataset/SUNAttributeDB/train.txt"
test_im_list = "../dataset/SUNAttributeDB/test.txt"

set_session(0)

# The data, shuffled and split between train and test sets:
(x_train, y_train), (x_test, y_test) = load_data(im_dir, train_im_list,
                                                 test_im_list)

model = resnet.ResnetBuilder.build_resnet_34((3, 224, 224), 102)

# initiate SGD optimizer
opt = keras.optimizers.SGD(lr=0.05, decay=1e-6)

# Let's train the model using SGD
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

print('Not using data augmentation.')
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint/checkpoint-{epoch:02d}-{val_loss:.2f}.h5')
model.fit(x_train, y_train,
          batch_size=batch_size,
          # the call is truncated in the source; the remaining arguments are a
          # reconstruction consistent with the variables defined above
          epochs=epochs,
          validation_data=(x_test, y_test),
          callbacks=[checkpoint])
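# Hedged follow-up (not in the original snippet): evaluate the trained network
# on the held-out split; with the metrics configured above, model.evaluate
# returns [loss, accuracy].
scores = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', scores[0], '- test accuracy:', scores[1])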
currentDT = str(datetime.datetime.now()).replace(':', '_').replace('.', '_').replace(' ', '_')
print(currentDT)

# ======================================================= DATA Preparation =============================================
# We need the data as: an annotation dataframe, a documents dataframe, and,
# if training, to divide the documents between train_docs and test_docs.
# The load_data function can output these if given an annotations dataframe
# and the path to the files (see the required format in data_preparation).
all_annotations, documents, train_docs, test_docs = load_mt_samples()  # MT samples data
# cris_annotations, cris_documents, cris_train_docs, cris_test_docs = load_data(...)  # CRIS data
i2b2_annotations, i2b2_documents = load_data('../TimeDatasets/i2b2 Data/i2b2_timexe_annotations.xlsx')
test_i2b2_docs = i2b2_documents[i2b2_documents.test == True]
train_i2b2_docs = i2b2_documents[i2b2_documents.test == False]
print(i2b2_documents)

# ============================================= MODEL TRAINING =================================================
# model parameters
spacy_type = False  # choosing which typing to use. True for DATE and TIME types.
other_annotations = False
nb_iter = 2

if not spacy_type:
    output_dir = 'models/all_types_model_' + currentDT
else:
    output_dir = 'models/spacy_types_model_' + currentDT
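# Hedged housekeeping sketch (not in the original excerpt): create the chosen
# output directory before training writes the model into it.
import os
os.makedirs(output_dir, exist_ok=True)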
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
# note: sklearn.grid_search and sklearn.cross_validation are the pre-0.18
# module paths; in current scikit-learn both live in sklearn.model_selection
from sklearn.grid_search import RandomizedSearchCV  # , GridSearchCV
from sklearn.cross_validation import train_test_split

from data_preparation import load_data, prepare_datasets
from data_transformation import feature_transformation
from metrics import ndcg_scorer
from evaluation import score_classifier, display_learning_curves

print("loading data in memory...")
(df_train, target, df_test) = load_data()
print("preparing datasets...")
(X, y, X_test, label_encoder) = prepare_datasets(df_train, df_test, target)
print("Transforming datasets...")
(X_transformed, scaler, transformer) = feature_transformation(X)
X_test_scaled = scaler.transform(X_test)
X_test_transformed = transformer.transform(X_test_scaled)
print("splitting data")
(X_train, X_te, y_train, y_te) = train_test_split(
    X_transformed, y, test_size=.25, random_state=41)


def main():
from analyse import plot_comparison_curve
from data_preparation import load_data
import constants  # assumed import; constants is used throughout this script

column_drop_list = [
    constants.EMPLYEENO_R, constants.EMPLOYEECOUNT_R, constants.ISOVER18_R,
    constants.STDHOURS_R
]
encode_list = [
    constants.GENDER_T, constants.STATUS_T, constants.DEPARTMENT_T,
    constants.ROLE_T, constants.OVERTIME_T, constants.TRAVEL_T,
    constants.ISRESIGNED_T, constants.EDUCATION_T
]
file_path = '.\\datasets\\WA_Fn-UseC_-HR-Employee-Attrition.csv'
header, data, m_header, m_data, analytics = load_data(file_path, encode_list,
                                                      column_drop_list)
# analyse_data_init(header, analytics)

features_to_analyse = [
    constants.STOCKOPTIONS, constants.TRAINING, constants.SATISFACTION,
    constants.TEAMCLICK, constants.ROLE_T, constants.LEVEL,
    constants.DEPARTMENT_T, constants.EDUCATION_T, constants.GENDER_T,
    constants.COMPANIES, constants.STATUS_T, constants.RATING,
    constants.LIFEBALANCE, constants.INVOLVEMENT
]
# the reassignment below narrows the analysis to a single feature for this run
features_to_analyse = [
    constants.RATING,
]
def prepare_model_data(rewrite_files=False):
    '''
    Prepares the data set files used by the models.

    Returns:
    results -- dictionary containing the following
        train_x_nb -- training x for naive bayes data set
        train_y    -- training y data set
        test_x_nb  -- test x for naive bayes data set
        test_y     -- test y data set
        train_x_nn -- training x for neural nets (& Logistic regression) data set
        test_x_nn  -- test x for neural nets (& Logistic regression) data set
    '''
    # step 1: load the original data, dropping constant columns and encoding
    # the categorical ones
    column_drop_list = [
        constants.EMPLYEENO_R, constants.EMPLOYEECOUNT_R, constants.ISOVER18_R,
        constants.STDHOURS_R
    ]
    encode_list = [
        constants.GENDER_T, constants.STATUS_T, constants.DEPARTMENT_T,
        constants.ROLE_T, constants.OVERTIME_T, constants.TRAVEL_T,
        constants.ISRESIGNED_T, constants.EDUCATION_T
    ]
    _, _, m_header, m_data, _ = load_data(constants.orig_file, encode_list,
                                          column_drop_list)
    save_data(constants.processed_file, m_header, m_data,
              override=rewrite_files)

    # step 2: split 80/20 into train and test, and separate the target column
    train_data, test_data, _ = partition_data(m_data, 0.8, 0.2)
    output_idx = np.argwhere(m_header == constants.ISRESIGNED_T).squeeze()
    train_y = train_data[:, output_idx]
    train_x = np.delete(train_data, output_idx, 1)
    test_y = test_data[:, output_idx]
    test_x = np.delete(test_data, output_idx, 1)
    traintest_header = np.delete(m_header, output_idx, 0)
    save_data(constants.train_x_file, traintest_header, train_x,
              override=rewrite_files)
    save_data(constants.train_y_file, np.array([constants.ISRESIGNED_T]),
              train_y, override=rewrite_files)
    save_data(constants.test_x_file, traintest_header, test_x,
              override=rewrite_files)
    save_data(constants.test_y_file, np.array([constants.ISRESIGNED_T]),
              test_y, override=rewrite_files)

    # to execute naive bayes we will discretise continuous data
    column_bins_definition = {
        constants.AGE: 10,
        constants.DAILYRATE: 10,
        constants.HOMEDISTANCE: 10,
        constants.SALARY: 10,
        constants.HOURLYRATE: 10,
        constants.MONTHLYRATE: 10,
        constants.YEARSEMPLOYED: 5,
        constants.YEARSCOMPANY: 5,
        constants.YEARSROLE: 5,
        constants.YEARSLASTPROMO: 5,
        constants.YEARSMANAGER: 5,
        constants.LASTINCREMENTPERCENT: 16
    }
    train_x_nb = digitize_columns(traintest_header, train_x,
                                  column_bins_definition)
    test_x_nb = digitize_columns(traintest_header, test_x,
                                 column_bins_definition)
    save_data(constants.train_x_nb_file, traintest_header, train_x_nb,
              override=rewrite_files)
    save_data(constants.test_x_nb_file, traintest_header, test_x_nb,
              override=rewrite_files)

    # for logistic regression, feature-scale the continuous columns
    columns_norm = [
        constants.AGE, constants.DAILYRATE, constants.HOMEDISTANCE,
        constants.SALARY, constants.HOURLYRATE, constants.MONTHLYRATE,
        constants.YEARSEMPLOYED, constants.YEARSCOMPANY, constants.YEARSROLE,
        constants.YEARSLASTPROMO, constants.YEARSMANAGER,
        constants.LASTINCREMENTPERCENT
    ]
    train_x_lr = feature_scale_columns(traintest_header, train_x, columns_norm)
    test_x_lr = feature_scale_columns(traintest_header, test_x, columns_norm)
    save_data(constants.train_x_lr_file, traintest_header, train_x_lr,
              override=rewrite_files)
    save_data(constants.test_x_lr_file, traintest_header, test_x_lr,
              override=rewrite_files)

    # for the neural nets, encode the categorical/ordinal columns
    encode_list_nn = [
        constants.GENDER_T, constants.STATUS_T, constants.DEPARTMENT_T,
        constants.ROLE_T, constants.OVERTIME_T, constants.TRAVEL_T,
        constants.EDUCATION_T, constants.ENVIRONMENT, constants.INVOLVEMENT,
        constants.LEVEL, constants.SATISFACTION, constants.COMPANIES,
        constants.RATING, constants.TEAMCLICK, constants.STOCKOPTIONS,
        constants.TRAINING, constants.LIFEBALANCE
    ]
    train_test_header_nn, train_x_nn = encode_columns_nn(
        traintest_header, train_x_lr, encode_list_nn)
    _, test_x_nn = encode_columns_nn(traintest_header, test_x_lr,
                                     encode_list_nn)
    save_data(constants.train_x_nn_file, train_test_header_nn, train_x_nn,
              override=rewrite_files)
    save_data(constants.test_x_nn_file, train_test_header_nn, test_x_nn,
              override=rewrite_files)

    results = {}
    results['train_x_nb'] = train_x_nb
    results['train_y'] = train_y
    results['test_x_nb'] = test_x_nb
    results['test_y'] = test_y
    results['train_x_nn'] = train_x_nn
    results['test_x_nn'] = test_x_nn
    return results
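# Usage sketch (not in the source): rebuild all model files once, then read
# the splits the models consume from the returned dictionary.
results = prepare_model_data(rewrite_files=True)
train_x_nn, test_x_nn = results['train_x_nn'], results['test_x_nn']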
from predict import digitize_columns
from predict import standardize_columns
from predict import encode_columns_nn
from data_preparation import load_data
from data_preparation import save_data

orig_file_path = '.\\datasets\\TESTING_ml.csv'
processed_file_path = '.\\datasets\\TESTING_ml_proc.csv'
nb_file_path = '.\\datasets\\TESTING_ml_nb.csv'
lr_file_path = '.\\datasets\\TESTING_ml_lr.csv'
nn_file_path = '.\\datasets\\TESTING_ml_nn.csv'

column_drop_list = ['Delete']
encode_list = ['Gender', 'Sport']
_, _, header, data, _ = load_data(orig_file_path, encode_list,
                                  column_drop_list)
save_data(processed_file_path, header, data)

# discretise the continuous columns for naive bayes
column_bins_definition = {'Age': 5, 'Salary': 5}
data_nb = digitize_columns(header, data, column_bins_definition)
save_data(nb_file_path, header, data_nb)

# standardise the continuous columns for logistic regression
columns_norm = ['Age', 'Salary']
data_lr = standardize_columns(header, data, columns_norm)
save_data(lr_file_path, header, data_lr)
print(header)
print(data_lr)

encode_list_nn = ['Gender', 'Sport']
header_nn, data_nn = encode_columns_nn(header, data_lr, encode_list_nn)
save_data(nn_file_path, header_nn, data_nn)