def main(argv):
    """Sweep the PCA component count and chart the LR validation scores.

    For every component count in ``comps`` a fresh ``lib.cl.CL`` is built,
    the training data is outlier-filtered (``cl.lof``), shuffled, projected
    with ``cl.pca`` and split with ``io.pick_set``; the LR classifier is then
    trained and validated.  The pair returned by ``cl.lr_cl_val`` is collected
    per component count and handed to ``viz.cross_results``.

    Args:
        argv: Command-line arguments; not used by this entry point.
    """
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    lr_model_filename = 'lr_classif.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()

    # Read data
    X_o, y_o = io.read_data(input_filename_x, input_filename_y)
    # The test set is only read so its size can be reported; nothing is
    # predicted in this sweep.
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X_o)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    LR_ll, LR_a = [], []
    comps = [10, 20, 50, 100, 150, 200, 264]
    for s in comps:
        print("Amount of components: %d" % s)
        cl = lib.cl.CL(io, viz)

        # Work on copies so every component count starts from the same
        # untouched data.
        X = copy.deepcopy(X_o)
        y = copy.deepcopy(y_o)

        # Remove outliers
        X, y = cl.lof(np.matrix(X), np.matrix(y))

        # Shuffle
        X, y = io.shuffle(X, y)

        # PCA
        X = cl.pca(np.matrix(X), components=s, filename=None).tolist()

        # Validation set is picked before the training set, as in the
        # original call order.
        _, val_x, val_y = io.pick_set(X, y, 726)
        _, train_x, train_y = io.pick_set(X, y, 3200)

        # Train (the model file is overwritten on every iteration)
        cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)

        # Validate
        ll, a = cl.lr_cl_val(val_x, val_y)
        LR_ll.append(ll)
        LR_a.append(a)

    # Draw some results
    viz.cross_results(comps, LR_ll, LR_a, 'pca_cross_val.png')
def main(argv):
    """Load a stored LR model, validate it and write test-set predictions.

    The training data is outlier-filtered (``cl.lof``), shuffled and split
    with ``io.pick_set``.  The model is loaded from ``lr2_classif.pkl``
    rather than trained, validated on the picked validation set, and then
    used to predict the test set.  Classes and probabilities are written to
    ``classes_lr2_result.csv`` and ``probabilities_lr2_result.csv``.

    Args:
        argv: Command-line arguments; not used by this entry point.
    """
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    lr_model_filename = 'lr2_classif.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # Remove outliers
    X, y = cl.lof(np.matrix(X), np.matrix(y))

    # Shuffle
    X, y = io.shuffle(X, y)

    # Split data to train and validation set.  The training pick is kept
    # even though training is disabled below, so the sequence of
    # io.pick_set calls stays the same as before.
    _, val_x, val_y = io.pick_set(X, y, 726)
    _, train_x, train_y = io.pick_set(X, y, 3200)

    # Training is disabled; the stored model is loaded instead.  To retrain,
    # call cl.lr_cl_train(train_x, train_y, filename=lr_model_filename).
    cl.lr_cl_load(lr_model_filename)

    # Validate (return value is not used here)
    cl.lr_cl_val(val_x, val_y)

    # Predict
    pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_lr2_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_lr2_result.csv', test_ids,
                           pred_proba)
def main(argv):
    """Validate stored SVC, LR and RFC models, chart them and predict with RFC.

    The three models are loaded from disk rather than trained.  SVC is
    validated on PCA-projected data, while LR and RFC are validated on the
    untransformed copy of the data.  The validation results are passed to
    ``viz.model_comp_results`` and the RFC predictions for the test set are
    written to ``classes_sub_result.csv`` / ``probabilities_sub_result.csv``.

    Args:
        argv: Command-line arguments; not used by this entry point.
    """
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    # Untransformed copies for the models validated without PCA.
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # PCA on the training data only; the test set stays untransformed
    # because the RFC used for prediction works on non-PCA features.
    X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()

    _, val_x, val_y = io.pick_set(X, y, 1063)
    _, no_pca_val_x, no_pca_val_y = io.pick_set(X_, y_, 1063)

    # Models are loaded from disk; training is disabled in this variant.
    cl.svc_cl_load(svc_model_filename)
    cl.lr_cl_load(lr_model_filename)
    cl.rfc_cl_load(rfc_model_filename)

    # Validate
    results = {}
    results['SVC'] = cl.svc_cl_val(val_x, val_y)
    # Labelled to match the sibling comparison entry point, which reports
    # the same cl.lr_cl_val result as 'Logistic Regression'.
    results['Logistic Regression'] = cl.lr_cl_val(no_pca_val_x,
                                                  no_pca_val_y)
    results['Random Forest Classifier'] = cl.rfc_cl_val(no_pca_val_x,
                                                        no_pca_val_y)

    # Draw some results
    viz.model_comp_results(results, model_comp_result_chart_filename)

    # Predict
    pred_class, pred_proba = cl.rfc_cl_pred(test_x)

    # Output
    io.write_classes('classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_sub_result.csv', test_ids,
                           pred_proba)
def main(argv):
    """Train and validate SVC, LR and RFC 50 times and chart the mean scores.

    Each iteration builds a fresh ``lib.cl.CL``, filters outliers, shuffles,
    projects the SVC/LR data with ``cl.pca`` and trains all three models
    (RFC on the non-PCA copy).  The per-iteration validation pairs are
    averaged and passed to ``viz.model_comp_results``.  Finally the LR model
    of the last iteration predicts the test set and the results are written
    to ``classes_sub_result.csv`` / ``probabilities_sub_result.csv``.

    Args:
        argv: Command-line arguments; not used by this entry point.
    """
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    def mean(values):
        # float() keeps integer scores from being truncated by Python 2
        # integer division.
        return sum(values) / float(len(values))

    io = lib.io.IO()
    viz = lib.viz.Viz()

    # Read data
    X_o, y_o = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X_o)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    # Keep the raw test matrix outside the loop.  Previously test_x was
    # overwritten with its own projection on every iteration, so the final
    # prediction ran on data that had been passed through PCA 50 times.
    raw_test_x = np.matrix(test_x)
    test_ids = range(1, len(raw_test_x) + 1)

    SVC_ll, SVC_a, RFC_ll, RFC_a, LR_ll, LR_a = [], [], [], [], [], []
    for s in range(50):
        print("Iteration %d" % s)
        cl = lib.cl.CL(io, viz)

        # X/y feed the PCA-based models, X_/y_ the non-PCA random forest.
        X = copy.deepcopy(X_o)
        y = copy.deepcopy(y_o)
        X_ = copy.deepcopy(X_o)
        y_ = copy.deepcopy(y_o)

        # Remove outliers
        X, y = cl.lof(np.matrix(X), np.matrix(y))
        X_, y_ = cl.lof(np.matrix(X_), np.matrix(y_))

        # Shuffle
        X, y = io.shuffle(X, y)
        X_, y_ = io.shuffle(X_, y_)

        # PCA; np.matrix(...) hands cl.pca its own copy of the raw test data.
        X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
        test_x = cl.pca(np.matrix(raw_test_x), None).tolist()

        _, val_x, val_y = io.pick_set(X, y, 726)
        _, no_pca_val_x, no_pca_val_y = io.pick_set(X_, y_, 726)
        _, train_x, train_y = io.pick_set(X, y, 3200)
        _, no_pca_train_x, no_pca_train_y = io.pick_set(X_, y_, 3200)

        # Train
        cl.svc_cl_train(train_x, train_y, filename=svc_model_filename)
        cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
        cl.rfc_cl_train(no_pca_train_x, no_pca_train_y,
                        filename=rfc_model_filename,
                        feat_imp_plot_filename=rfc_feat_imp_filename)

        # Validate
        ll, a = cl.svc_cl_val(val_x, val_y)
        SVC_ll.append(ll)
        SVC_a.append(a)

        ll, a = cl.lr_cl_val(val_x, val_y)
        LR_ll.append(ll)
        LR_a.append(a)

        ll, a = cl.rfc_cl_val(no_pca_val_x, no_pca_val_y)
        RFC_ll.append(ll)
        RFC_a.append(a)

    # Draw some results
    results = {
        'SVC': (mean(SVC_ll), mean(SVC_a)),
        'Logistic Regression': (mean(LR_ll), mean(LR_a)),
        'Random Forest Classifier': (mean(RFC_ll), mean(RFC_a)),
    }
    viz.model_comp_results(results, model_comp_result_chart_filename)

    # Predict with the LR model trained in the last iteration, on the test
    # data projected in that same iteration.
    pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_sub_result.csv', test_ids,
                           pred_proba)
# NOTE(review): this is the interior of a function whose header is not
# visible here; the nesting below is reconstructed from the statements
# themselves and should be confirmed against the full file.
test_ids = range(1, len(test_x) + 1)

if options.load_path is None:
    # No stored model was requested: train one from the data.

    # Remove outliers
    # NOTE(review): other entry points call cl.lof; confirm cl.lfo exists.
    X, y = cl.lfo(np.matrix(X), np.matrix(y))

    # Shuffle
    X, y = io.shuffle(X, y)

    # Pick val and train set (validation first, then training)
    val_ids, val_x, val_y = io.pick_set(X, y, 526)
    train_ids, train_x, train_y = io.pick_set(X, y, 3400)

    # Train
    model, history = train_model(train_x, train_y, val_x, val_y)

    # Save model: architecture as YAML text, weights in a separate file
    full_model_filename = options.model_path + '/' + model_filename
    full_weights_filename = options.model_path + '/' + weights_filename
    with open(full_model_filename, 'w') as model_file:
        model_file.write(model.to_yaml())
    model.save_weights(full_weights_filename)

    # Print metrics; history is only assigned on this training path
    viz.plot_nn_perf(history, options.output_path + '/nn_perf.png')
else:
    # Load the model definition and weights from options.load_path
    stored_model_file = options.load_path + '/' + model_filename
    stored_weights_file = options.load_path + '/' + weights_filename
    model = load_model(stored_model_file, stored_weights_file)
def main(argv):
    """Train the ``lib.nn.NN`` network, report accuracies and write predictions.

    Labels are shifted by -1 before training and the predicted classes are
    shifted back by +1 before being written.  Both the training data and the
    test data are passed through ``cl.pca``.  Accuracy is printed for the
    training set before and after training and for the validation set after
    training; the trained network is saved to ``nn1.pkl``.

    Args:
        argv: Command-line arguments; not used by this entry point.
    """
    # File names.  Several of these are not referenced further down in this
    # entry point.
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'
    nn_model_filename = 'nn1.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    nn = lib.nn.NN(io, viz)
    cl = lib.cl.CL(io, viz)

    # Read data
    print("Reading train data...")
    X, y = io.read_data(input_filename_x, input_filename_y)
    y = io.shift_v(y, shift=-1)
    print("Reading test data...")
    test_x = io.read_data(test_input_filename, None)

    train_count_msg = "There are " + str(len(X)) + " samples in the train set."
    print(train_count_msg)
    test_count_msg = ("There are " + str(len(test_x))
                      + " samples in the test set.")
    print(test_count_msg)

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # PCA on both the training and the test data
    X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    test_x = cl.pca(test_x, None).tolist()

    # Split data to train and validation set (validation picked first)
    val_ids, val_x, val_y = io.pick_set(X, y, 563)
    train_ids, train_x, train_y = io.pick_set(X, y, 3800)

    # The input width comes from the picked training set
    n_inputs = train_x.shape[1]
    nn.initialize(n_inputs, nn1=18, nn2=9, alpha=0.01)

    # Baseline accuracy before any training
    pred, proba, acc = nn.predict(train_x, train_y)
    print("Train set classification accuray before training: %.4f" % acc)

    # Train and persist the network
    nn.train(train_x, train_y, val_x, val_y, training_steps=100000)
    nn.save_nn(nn_model_filename)

    # Validate
    pred, proba, acc = nn.predict(train_x, train_y)
    print("Train set classification accuray after training: %.4f" % acc)
    pred, proba, acc = nn.predict(val_x, val_y)
    print("Validation set classification accuray after training: %.4f" % acc)

    # Predict; undo the -1 label shift applied when the data was read
    pred_class, pred_proba, _ = nn.predict(test_x)
    pred_class = io.shift_v(pred_class, shift=1)

    # Output
    io.write_classes('nn_classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('nn_probabilities_sub_result.csv', test_ids,
                           pred_proba)